diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml new file mode 100644 index 0000000000..834add9314 --- /dev/null +++ b/.github/workflows/format.yml @@ -0,0 +1,36 @@ +name: github-FORMAT + +on: + pull_request: + branches: + - master + - develop + +jobs: + clang-format-check: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + + - name: Install Dependencies + run: sudo apt install clang-format-8 + + - name: check + run: | + # Fetch from the default remote (origin) + git fetch &> /dev/null + + # For every file changed, apply clang-format + for file in $(git diff --name-only origin/$GITHUB_BASE_REF | egrep '.*\.cpp$|.*\.hpp$|.*\.h$'); do + if [ -e $file ]; then + clang-format-8 -i -style=file $file + git add $file + fi + done + + # If any diffs exist, error out + if [[ ! -z $(git status -s -uno . -- ':!.github') ]]; then + echo "The following files require formatting changes:" + git status -s -uno . -- ':!.github' + exit 1 + fi diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml new file mode 100644 index 0000000000..72152f749a --- /dev/null +++ b/.github/workflows/osx.yml @@ -0,0 +1,86 @@ +name: github-OSX + +on: + pull_request: + branches: + - master + - develop + +jobs: + osxci: + name: osx-ci + runs-on: [macos-latest] + + strategy: + matrix: + include: + - backend: "SERIAL" + cmake_build_type: "RelWithDebInfo" + - backend: "THREADS" + cmake_build_type: "RelWithDebInfo" + - backend: "SERIAL" + cmake_build_type: "Debug" + - backend: "SERIAL" + cmake_build_type: "Release" + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v2 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v2 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + ls -lat + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DKokkos_ENABLE_${{ matrix.backend }}=ON \ + -DCMAKE_CXX_FLAGS="-Werror" \ + -DCMAKE_CXX_STANDARD=14 \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j2 install + + - name: configure_kokkos_kernels + run: | + ls -lat + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib/cmake/Kokkos \ + -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ + -DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_COMPLEX_FLOAT=ON \ + -DKokkosKernels_INST_FLOAT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT:BOOL=ON \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + .. 
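The OSX job above builds Kokkos and Kokkos Kernels as two separate CMake projects: Kokkos is configured, built, and installed first, and the Kokkos Kernels configure step then points `Kokkos_DIR` at the `lib/cmake/Kokkos` directory of that install. A minimal local sketch of the same two-stage configure is shown below; it assumes side-by-side `kokkos/` and `kokkos-kernels/` checkouts and a Serial-only build, so the paths, backend, and flags are placeholders to be adapted from the workflow above.

```bash
# Two-stage build mirroring the OSX CI job: install Kokkos, then build
# Kokkos Kernels against that install via Kokkos_DIR (paths are placeholders).
cmake -S kokkos -B kokkos/build \
  -DKokkos_ENABLE_SERIAL=ON \
  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
  -DCMAKE_INSTALL_PREFIX=$PWD/kokkos/install
cmake --build kokkos/build -j 2 --target install

cmake -S kokkos-kernels -B kokkos-kernels/build \
  -DKokkos_DIR=$PWD/kokkos/install/lib/cmake/Kokkos \
  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
  -DKokkosKernels_ENABLE_TESTS=ON
cmake --build kokkos-kernels/build -j 2
(cd kokkos-kernels/build && ctest -j2 --output-on-failure)
```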
+ + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j2 + + - name: test + working-directory: kokkos-kernels/build + run: ctest -j2 --output-on-failure \ No newline at end of file diff --git a/.jenkins/nightly.groovy b/.jenkins/nightly.groovy index e98b34001c..f30d580edc 100644 --- a/.jenkins/nightly.groovy +++ b/.jenkins/nightly.groovy @@ -1,40 +1,86 @@ pipeline { agent none + options { + timeout(time: 3, unit: 'HOURS') + } + stages { - stage('HIP-ROCm-4.2-C++14') { - agent { - dockerfile { - filename 'Dockerfile.hip' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.2' - label 'rocm-docker && vega' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + stage('Build & Run') { + parallel { + stage('SYCL-OneAPI') { + agent { + dockerfile { + filename 'Dockerfile.sycl' + dir 'scripts/docker' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache' + } + } + steps { + sh '''rm -rf kokkos && + git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DKokkos_ENABLE_SYCL=ON \ + -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ + -DCMAKE_CXX_STANDARD=17 \ + .. && \ + make -j8 && make install && \ + cd ../.. && rm -rf kokkos''' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + .. && \ + make -j8''' + } + } + + stage('HIP-ROCm-4.5-C++14') { + agent { + dockerfile { + filename 'Dockerfile.hip' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.5' + label 'rocm-docker && vega' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh '''rm -rf kokkos && + git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DKokkos_ENABLE_HIP=ON \ + .. && \ + make -j8 && make install && \ + cd ../.. && rm -rf kokkos''' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + .. && \ + make -j8 && ctest --verbose''' + } } - } - steps { - sh '''rm -rf kokkos && - git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ - mkdir build && cd build && \ - cmake \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DKokkos_ENABLE_HIP=ON \ - .. && \ - make -j8 && make install && \ - cd ../.. 
&& rm -rf kokkos''' - sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - .. && \ - make -j8 && ctest --verbose''' } } } diff --git a/CHANGELOG.md b/CHANGELOG.md index 76de9db0d0..a961701013 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,129 @@ # Change Log +## [3.6.00](https://github.com/kokkos/kokkos-kernels/tree/3.6.00) (2022-02-18) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.5.00...3.6.00) + +### Features: + +#### Batched Sparse Linear algebra +- Kokkos Kernels is adding a new component to the library: batched sparse linear algebra. +- Similarly to the current dense batched algorithms, the new algorithms are called from +- the GPU and provide Team and TeamVector levels of parallelism; SpMV also provides a Serial +- call on the GPU. + +- Add Batched CG and Batched GMRES [\#1155](https://github.com/kokkos/kokkos-kernels/pull/1155) +- Add Jacobi Batched preconditioner [\#1219](https://github.com/kokkos/kokkos-kernels/pull/1219) + +#### Bsr and Tensor core algorithm for sparse linear algebra +- After introducing the BsrMatrix in release 3.5.0, new algorithms now support this format. +- For release 3.6.0 we are adding matrix-vector (matvec) multiplication and Gauss-Seidel, as well as an +- implementation of matvec that leverages tensor cores on Nvidia GPUs. More kernels are expected to +- support the Bsr format in future releases. + +- Add Spmv for BsrMatrix [\#1255](https://github.com/kokkos/kokkos-kernels/pull/1255) +- Add BLAS to SpMV operations for BsrMatrix [\#1297](https://github.com/kokkos/kokkos-kernels/pull/1297) +- BSR format support in block Gauss-Seidel [\#1232](https://github.com/kokkos/kokkos-kernels/pull/1232) +- Experimental tensor-core SpMV for BsrMatrix [\#1090](https://github.com/kokkos/kokkos-kernels/pull/1090) + +#### Improved AMD math libraries support +- rocBLAS and rocSPARSE TPLs are now officially supported; they can be enabled at configure time. +- Initial kernels that can call rocBLAS are GEMV, GEMM, IAMAX and SCAL, while rocSPARSE can be +- called for matrix-vector multiplication. Further support for TPL calls can be requested on Slack +- and via GitHub issues. 
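A minimal configure sketch for enabling these TPLs is given below. It assumes a HIP-enabled Kokkos install and ROCm under `/opt/rocm` (which is what lets the new `FindTPLROCBLAS.cmake`/`FindTPLROCSPARSE.cmake` modules locate the libraries via `ROCM_PATH`); the compiler and paths are placeholders. The `KokkosKernels_ENABLE_TPL_ROCBLAS`/`KokkosKernels_ENABLE_TPL_ROCSPARSE` switches follow the same naming pattern as the existing CUBLAS/CUSPARSE options.

```bash
# Sketch: configure Kokkos Kernels with the new rocBLAS/rocSPARSE TPLs enabled.
# Assumes a HIP-enabled Kokkos install and ROCm in /opt/rocm; paths are placeholders.
export ROCM_PATH=/opt/rocm
cmake \
  -DCMAKE_CXX_COMPILER=hipcc \
  -DKokkos_DIR=/path/to/kokkos-install/lib/cmake/Kokkos \
  -DKokkosKernels_ENABLE_TPL_ROCBLAS=ON \
  -DKokkosKernels_ENABLE_TPL_ROCSPARSE=ON \
  /path/to/kokkos-kernels
```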
+ +- Tpl rocBLAS and rocSPARSE [\#1153](https://github.com/kokkos/kokkos-kernels/pull/1153) +- Add rocBLAS GEMV wrapper [\#1201](https://github.com/kokkos/kokkos-kernels/pull/1201) +- Add rocBLAS wrappers for GEMM, IAMAX, and SCAL [\#1230](https://github.com/kokkos/kokkos-kernels/pull/1230) +- SpMV: adding support for rocSPARSE TPL [\#1221](https://github.com/kokkos/kokkos-kernels/pull/1221) + +#### Additional new features +- bhalf: Unit test Batched GEMM [\#1251](https://github.com/kokkos/kokkos-kernels/pull/1251) +- and demostrate GMRES example convergence with bhalf_t (https://github.com/kokkos/kokkos-kernels/pull/1300) +- Stream interface: adding stream support in GEMV and GEMM [\#1131](https://github.com/kokkos/kokkos-kernels/pull/1131) +- Improve double buffering batched gemm performance [\#1217](https://github.com/kokkos/kokkos-kernels/pull/1217) +- Allow choosing coloring algorithm in multicolor GS [\#1199](https://github.com/kokkos/kokkos-kernels/pull/1199) +- Batched: Add armpl dgemm support [\#1256](https://github.com/kokkos/kokkos-kernels/pull/1256) + +### Deprecations: +- Deprecation warning: SpaceAccessibility move out of impl, see #1140 [\#1141](https://github.com/kokkos/kokkos-kernels/pull/1141) + +### Backends and Archs Enhancements: + +#### SYCL: +- Full Blas support on SYCL [\#1270](https://github.com/kokkos/kokkos-kernels/pull/1270) +- Get sparse tests enabled and working for SYCL [\#1269](https://github.com/kokkos/kokkos-kernels/pull/1269) +- Changes to make graph run on SYCL [\#1268](https://github.com/kokkos/kokkos-kernels/pull/1268) +- Allow querying free/total memory for SYCL [\#1225](https://github.com/kokkos/kokkos-kernels/pull/1225) +- Use KOKKOS_IMPL_DO_NOT_USE_PRINTF instead of printf in kernels [\#1162](https://github.com/kokkos/kokkos-kernels/pull/1162) + +#### HIP: +- Work around hipcc size_t/int division with remainder bug [\#1262](https://github.com/kokkos/kokkos-kernels/pull/1262) + +#### Other Improvements: +- Replace std::abs with ArithTraits::abs [\#1312](https://github.com/kokkos/kokkos-kernels/pull/1312) +- Batched/dense: Add Gemm_DblBuf LayoutLeft operator [\#1299](https://github.com/kokkos/kokkos-kernels/pull/1299) +- KokkosKernels: adding variable that returns version as a single number [\#1295](https://github.com/kokkos/kokkos-kernels/pull/1295) +- Add KOKKOSKERNELS_FORCE_SIMD macro (Fix #1040) [\#1290](https://github.com/kokkos/kokkos-kernels/pull/1290) +- Rename KOKKOS_IF_{HOST,DEVICE} -> KOKKOS_IF_ON_{HOST,DEVICE} [\#1278](https://github.com/kokkos/kokkos-kernels/pull/1278) +- Algo::Level{2,3}::Blocked::mb() [\#1265](https://github.com/kokkos/kokkos-kernels/pull/1265) +- Batched: Use SerialOpt2 for 33 to 39 square matrices [\#1261](https://github.com/kokkos/kokkos-kernels/pull/1261) +- Prune extra dependencies [\#1241](https://github.com/kokkos/kokkos-kernels/pull/1241) +- Improve double buffering batched gemm perf for matrix sizes >64x64 [\#1239](https://github.com/kokkos/kokkos-kernels/pull/1239) +- Improve graph color perf test [\#1229](https://github.com/kokkos/kokkos-kernels/pull/1229) +- Add custom implementation for strcasecmp [\#1227](https://github.com/kokkos/kokkos-kernels/pull/1227) +- Replace __restrict__ with KOKKOS_RESTRICT [\#1223](https://github.com/kokkos/kokkos-kernels/pull/1223) +- Replace array reductions in BLAS-1 MV reductions [\#1204](https://github.com/kokkos/kokkos-kernels/pull/1204) +- Update MIS-2 and aggregation [\#1143](https://github.com/kokkos/kokkos-kernels/pull/1143) +- perf_test/blas/blas3: Update SHAs for 
benchmarking [\#1139](https://github.com/kokkos/kokkos-kernels/pull/1139) + +### Implemented enhancements BuildSystem +- Bump ROCm version 4.2 -> 4.5 in nightly Jenkins CI build [\#1279](https://github.com/kokkos/kokkos-kernels/pull/1279) +- scripts/cm_test_all_sandia: Add A64FX ci checks [\#1276](https://github.com/kokkos/kokkos-kernels/pull/1276) +- github/workflows: Add osx CI [\#1254](https://github.com/kokkos/kokkos-kernels/pull/1254) +- Update SYCL compiler version in CI [\#1247](https://github.com/kokkos/kokkos-kernels/pull/1247) +- Do not set Kokkos variables when exporting CMake configuration [\#1236](https://github.com/kokkos/kokkos-kernels/pull/1236) +- Add nightly CI check for SYCL [\#1190](https://github.com/kokkos/kokkos-kernels/pull/1190) +- Update cmake minimum version to 3.16 [\#866](https://github.com/kokkos/kokkos-kernels/pull/866) + +### Incompatibilities: +- Kokkos::Impl: removing a few more instances of throw_runtime_exception [\#1320](https://github.com/kokkos/kokkos-kernels/pull/1320) +- Remove Kokkos::Impl::throw_runtime_exception from Kokkos Kernels [\#1294](https://github.com/kokkos/kokkos-kernels/pull/1294) +- Remove unused memory space utility [\#1283](https://github.com/kokkos/kokkos-kernels/pull/1283) +- Clean up Kokkos header includes [\#1282](https://github.com/kokkos/kokkos-kernels/pull/1282) +- Remove private Kokkos header include (Cuda/Kokkos_Cuda_Half.hpp) [\#1281](https://github.com/kokkos/kokkos-kernels/pull/1281) +- Avoid using #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_* macro guards [\#1266](https://github.com/kokkos/kokkos-kernels/pull/1266) +- Rename enumerator Impl::Exec_{PTHREADS -> THREADS} [\#1253](https://github.com/kokkos/kokkos-kernels/pull/1253) +- Remove all references to the Kokkos QThreads backend [\#1238](https://github.com/kokkos/kokkos-kernels/pull/1238) +- Replace more occurences of Kokkos::Impl::is_view [\#1234](https://github.com/kokkos/kokkos-kernels/pull/1234) +- Do not use Kokkos::Impl::is_view [\#1214](https://github.com/kokkos/kokkos-kernels/pull/1214) +- Replace Kokkos::Impl::if_c -> std::conditional [\#1213](https://github.com/kokkos/kokkos-kernels/pull/1213) + +### Bug Fixes: +- Fix bug in spmv_mv_bsrmatrix() for Ampere GPU arch [\#1315](https://github.com/kokkos/kokkos-kernels/pull/1315) +- Fix std::abs calls for rocBLAS/rocSparse [\#1310](https://github.com/kokkos/kokkos-kernels/pull/1310) +- cast literal 0 to fragment scalar type [\#1307](https://github.com/kokkos/kokkos-kernels/pull/1307) +- Fix 1303: maintain correct #cols on A in twostage [\#1304](https://github.com/kokkos/kokkos-kernels/pull/1304) +- Add dimension checking to generic spmv interface [\#1301](https://github.com/kokkos/kokkos-kernels/pull/1301) +- Add missing barriers to TeamGMRES, fix vector len [\#1285](https://github.com/kokkos/kokkos-kernels/pull/1285) +- Examples: fixing some issues related to type checking [\#1267](https://github.com/kokkos/kokkos-kernels/pull/1267) +- Restrict BsrMatrix specialization for AMPERE and VOLTA to CUDA [\#1242](https://github.com/kokkos/kokkos-kernels/pull/1242) +- Fix compilation errors for multi-vectors in kk_print_1Dview() [\#1231](https://github.com/kokkos/kokkos-kernels/pull/1231) +- src/batched: Fixes #1224 [\#1226](https://github.com/kokkos/kokkos-kernels/pull/1226) +- Fix SpGEMM crashing on empty rows [\#1220](https://github.com/kokkos/kokkos-kernels/pull/1220) +- Fix issue #1212 [\#1218](https://github.com/kokkos/kokkos-kernels/pull/1218) +- example/gmres: Specify half_t namespace 
[\#1208](https://github.com/kokkos/kokkos-kernels/pull/1208) +- Check that ordinal types are signed [\#1188](https://github.com/kokkos/kokkos-kernels/pull/1188) +- Fixing a couple of small issue with tensor core spmv [\#1185](https://github.com/kokkos/kokkos-kernels/pull/1185) +- Fix #threads setting in pcg for OpenMP [\#1182](https://github.com/kokkos/kokkos-kernels/pull/1182) +- SpMV: fix catch all case to avoid compiler warnings [\#1179](https://github.com/kokkos/kokkos-kernels/pull/1179) +- using namespace should be scoped to prevent name clashes [\#1177](https://github.com/kokkos/kokkos-kernels/pull/1177) +- using namespace should be scoped to prevent name clashes, see issue #1170 [\#1171](https://github.com/kokkos/kokkos-kernels/pull/1171) +- Fix bug with mkl impl of spgemm [\#1167](https://github.com/kokkos/kokkos-kernels/pull/1167) +- Add missing $ to KOKKOS_HAS_TRILINOS in sparse_sptrsv_superlu check [\#1160](https://github.com/kokkos/kokkos-kernels/pull/1160) +- Small fixes to spgemm, and plug gaps in testing [\#1159](https://github.com/kokkos/kokkos-kernels/pull/1159) +- SpMV: mismatch in #ifdef check and kernel specialization [\#1151](https://github.com/kokkos/kokkos-kernels/pull/1151) +- Fix values dimension for block sparse matrices [\#1147](https://github.com/kokkos/kokkos-kernels/pull/1147) + ## [3.5.00](https://github.com/kokkos/kokkos-kernels/tree/3.5.00) (2021-10-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.4.01...3.5.00) diff --git a/CMakeLists.txt b/CMakeLists.txt index 95f4cd0ee9..c4c8a3ccfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) IF(NOT KOKKOSKERNELS_HAS_TRILINOS) - cmake_minimum_required(VERSION 3.10 FATAL_ERROR) + cmake_minimum_required(VERSION 3.16 FATAL_ERROR) IF (Spack_WORKAROUND) #if we are explicitly using Spack for development, #nuke the Spack compiler @@ -24,14 +24,14 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 5) + SET(KokkosKernels_VERSION_MINOR 6) SET(KokkosKernels_VERSION_PATCH 00) + SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") + MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") ENDIF() -IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") - CMAKE_POLICY(SET CMP0074 NEW) -ENDIF() +MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") +CMAKE_POLICY(SET CMP0074 NEW) INCLUDE(GNUInstallDirs) IF (KOKKOSKERNELS_HAS_TRILINOS) @@ -47,6 +47,8 @@ ENDIF() INCLUDE(cmake/fake_tribits.cmake) INCLUDE(cmake/kokkoskernels_tribits.cmake) +OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) + KOKKOSKERNELS_PACKAGE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index d633a139c8..b26ba7be97 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -291,6 +291,7 @@ display_help_text() { echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test" echo " build. This will still set certain required" echo " flags (such as -fopenmp, -lpthread, etc.)." 
+ echo "--shared: Build Kokkos and KokkosKernels as shared libraries (required for SYCL on Intel)" echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." @@ -348,6 +349,9 @@ KOKKOSKERNELS_DO_TESTS=ON KOKKOSKERNELS_DO_PERFSUITE=OFF KOKKOSKERNELS_DO_EXAMPLES=ON +#Build static libraries by default +BUILD_SHARED_LIBRARIES=OFF + KOKKOS_MAKEINSTALL_J=4 KERNELS_DEFAULT_ETI_OPTION="" @@ -467,6 +471,9 @@ do --debug|-dbg) KOKKOSKERNELS_DEBUG=ON ;; + --shared) + BUILD_SHARED_LIBRARIES=ON + ;; --no-default-eti) KERNELS_DEFAULT_ETI_OPTION="-DKokkosKernels_ADD_DEFAULT_ETI=OFF" ;; @@ -731,9 +738,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J @@ -758,7 +765,7 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" 
-DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOSKERNELS_BUILDTYPE_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOSKERNELS_BUILDTYPE_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a36e745c71..2dcedcc1c9 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index 
f930dd51a0..fbceffe76c 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -9,13 +9,5 @@ include(CMakeFindDependencyMacro) find_dependency(Kokkos HINTS @Kokkos_DIR@) -SET(Kokkos_ENABLE_OPENMP @Kokkos_ENABLE_OPENMP@) -SET(Kokkos_ENABLE_OPENMPTARGET @Kokkos_ENABLE_OPENMPTARGET@) -SET(Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) -SET(Kokkos_ENABLE_HIP @Kokkos_ENABLE_HIP@) -SET(Kokkos_ENABLE_SYCL @Kokkos_ENABLE_SYCL@) -SET(Kokkos_ENABLE_PTHREAD @Kokkos_ENABLE_PTHREAD@) -SET(Kokkos_ENABLE_SERIAL @Kokkos_ENABLE_SERIAL@) - INCLUDE("${KokkosKernels_CMAKE_DIR}/KokkosKernelsTargets.cmake") diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index adfd3cd118..f8dd2ae133 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -1,11 +1,15 @@ #ifndef KOKKOSKERNELS_CONFIG_H #define KOKKOSKERNELS_CONFIG_H - /* Define Fortran mangle from Trilinos macro definition */ -#ifndef F77_BLAS_MANGLE -# define F77_BLAS_MANGLE@F77_BLAS_MANGLE@ -#endif +// clang-format off +#ifndef F77_BLAS_MANGLE +#define F77_BLAS_MANGLE@F77_BLAS_MANGLE@ +#endif +// clang-format on + +/* Define the current version of Kokkos Kernels */ +#cmakedefine KOKKOSKERNELS_VERSION @KOKKOSKERNELS_VERSION@ /* Define if fortran blas 1 function can return complex type */ #cmakedefine KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX @@ -22,7 +26,6 @@ than just BLAS and LAPACK functions. */ #cmakedefine HAVE_KOKKOSKERNELS_MKL - #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE /* Define this macro if experimental features of Kokkoskernels are enabled */ @@ -61,11 +64,12 @@ /* Whether to build kernels for memory space Kokkos::HostSpace */ #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE - /* Whether to build kernels for scalar type double */ #cmakedefine KOKKOSKERNELS_INST_DOUBLE /* Whether to build kernels for scalar type float */ #cmakedefine KOKKOSKERNELS_INST_FLOAT +/* Whether to build kernels for scalar type Kokkos::Experimental::half_t */ +#cmakedefine KOKKOSKERNELS_INST_HALF /* Whether to build kernels for scalar type complex */ #cmakedefine KOKKOSKERNELS_INST_COMPLEX_DOUBLE /* Whether to build kernels for scalar type complex */ @@ -119,25 +123,27 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_METIS /* ARMPL */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ARMPL +#cmakedefine ARMPL_BUILD @ARMPL_BUILD@ +/* ROCBLAS */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +/* ROCSPARSE */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #cmakedefine KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV /* if MKL or ARMPL, BLAS is also defined */ -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) ||\ +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) || \ defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #if !defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) #define KOKKOSKERNELS_ENABLE_TPL_BLAS #endif #endif -#if !defined(KOKKOS_ENABLE_CUDA) \ - && !defined(KOKKOS_ENABLE_HIP) \ - && !defined(KOKKOS_ENABLE_SYCL) \ - && !defined(KOKKOS_ENABLE_OPENMPTARGET) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOSKERNELS_ENABLE_HOST_ONLY #endif - /* * "Optimization level" for computational kernels in this subpackage. * The higher the level, the more code variants get generated, and @@ -145,11 +151,10 @@ * mean both better performance overall, and more uniform performance * for corner cases. 
*/ -#define KOKKOSLINALG_OPT_LEVEL @KokkosLinAlg_Opt_Level@ +#define KOKKOSLINALG_OPT_LEVEL @KokkosLinAlg_Opt_Level @ #ifndef KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY false #endif - -#endif // KOKKOSKERNELS_CONFIG_H +#endif // KOKKOSKERNELS_CONFIG_H diff --git a/cmake/Modules/FindTPLARMPL.cmake b/cmake/Modules/FindTPLARMPL.cmake index 62e1e33ea3..6f56b0a884 100644 --- a/cmake/Modules/FindTPLARMPL.cmake +++ b/cmake/Modules/FindTPLARMPL.cmake @@ -14,6 +14,7 @@ ELSEIF (ARMPL_LIBRARIES) ELSEIF (ARMPL_LIBRARY_DIRS) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES amath ${ARMPL_LIB} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) ELSEIF (DEFINED ENV{ARMPL_DIR}) + SET(ARMPL_BUILD $ENV{ARMPL_BUILD}) SET(ARMPL_ROOT $ENV{ARMPL_DIR}) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake new file mode 100644 index 0000000000..0217e8cf2c --- /dev/null +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -0,0 +1,37 @@ +IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) +ELSEIF (ROCBLAS_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) +ELSEIF (ROCBLAS_LIBRARY_DIRS) + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES rocblas LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) +ELSEIF (KokkosKernels_ROCBLAS_ROOT) + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE + LIBRARIES + rocblas + LIBRARY_PATHS + ${KokkosKernels_ROCBLAS_ROOT}/lib + HEADERS + rocblas.h + HEADER_PATHS + ${KokkosKernels_ROCBLAS_ROOT}/include + ) +ELSEIF (DEFINED ENV{ROCM_PATH}) + MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") + SET(ROCBLAS_ROOT "$ENV{ROCM_PATH}/rocblas") + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE + LIBRARIES + rocblas + LIBRARY_PATHS + ${ROCBLAS_ROOT}/lib + HEADERS + rocblas.h + HEADER_PATHS + ${ROCBLAS_ROOT}/include + ) +ELSE() + MESSAGE(ERROR "rocBLAS was not detected properly, please disable it or provide sufficient information at configure time.") + # Todo: figure out how to use the target defined during rocblas installation + # FIND_PACKAGE(ROCBLAS REQUIRED) + # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) + # GET_TARGET_PROPERTY(ROCBLAS_LINK_LIBRARIES ${ROCBLAS_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) +ENDIF() \ No newline at end of file diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake new file mode 100644 index 0000000000..52a0261b48 --- /dev/null +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -0,0 +1,37 @@ +IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) +ELSEIF (ROCSPARSE_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) +ELSEIF (ROCSPARSE_LIBRARY_DIRS) + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES rocsparse LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) +ELSEIF (KokkosKernels_ROCSPARSE_ROOT) + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE + LIBRARIES + rocsparse + LIBRARY_PATHS + ${KokkosKernels_ROCSPARSE_ROOT}/lib + HEADERS + rocsparse.h + HEADER_PATHS + ${KokkosKernels_ROCSPARSE_ROOT}/include + ) +ELSEIF (DEFINED ENV{ROCM_PATH}) + MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") + SET(ROCSPARSE_ROOT "$ENV{ROCM_PATH}/rocsparse") + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE + 
LIBRARIES + rocsparse + LIBRARY_PATHS + ${ROCSPARSE_ROOT}/lib + HEADERS + rocsparse.h + HEADER_PATHS + ${ROCSPARSE_ROOT}/include + ) +ELSE() + MESSAGE(ERROR "rocSPARSE was not detected properly, please disable it or provide sufficient information at configure time.") + # Todo: figure out how to use the target defined during rocsparse installation + # FIND_PACKAGE(ROCSPARSE REQUIRED) + # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) + # GET_TARGET_PROPERTY(ROCSPARSE_LINK_LIBRARIES ${ROCSPARSE_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) +ENDIF() \ No newline at end of file diff --git a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index 11c51eed53..9346475f91 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -10,9 +10,15 @@ MACRO(CHECK_KOKKOS_BACKEND BE) ENDMACRO(CHECK_KOKKOS_BACKEND) CHECK_KOKKOS_BACKEND(SERIAL) -CHECK_KOKKOS_BACKEND(PTHREAD) +CHECK_KOKKOS_BACKEND(THREADS) CHECK_KOKKOS_BACKEND(OPENMP) CHECK_KOKKOS_BACKEND(OPENMPTARGET) CHECK_KOKKOS_BACKEND(CUDA) CHECK_KOKKOS_BACKEND(HIP) CHECK_KOKKOS_BACKEND(SYCL) + +# for backward compatibility. can be dropped when requiring Kokkos 3.6 +IF (Kokkos_ENABLE_PTHREAD) + SET(KOKKOS_ENABLE_THREADS ON) + SET(KOKKOSKERNELS_INST_EXECSPACE_THREADS_DEFAULT ON) +ENDIF() diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index 81ab89508e..47dce1f9d1 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -8,7 +8,7 @@ SET(EXEC_SPACES EXECSPACE_SYCL EXECSPACE_OPENMPTARGET EXECSPACE_OPENMP - EXECSPACE_PTHREAD + EXECSPACE_THREADS EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) @@ -16,7 +16,7 @@ SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) SET(EXECSPACE_SYCL_CPP_TYPE Kokkos::Experimental::SYCL) SET(EXECSPACE_OPENMPTARGET_CPP_TYPE Kokkos::Experimental::OpenMPTarget) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) -SET(EXECSPACE_PTHREAD_CPP_TYPE Kokkos::Threads) +SET(EXECSPACE_THREADS_CPP_TYPE Kokkos::Threads) SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) SET(MEM_SPACES @@ -174,17 +174,17 @@ ENDIF() KOKKOSKERNELS_ADD_OPTION( INST_EXECSPACE_THREADS - ${KOKKOSKERNELS_INST_EXECSPACE_PTHREAD_DEFAULT} + ${KOKKOSKERNELS_INST_EXECSPACE_THREADS_DEFAULT} BOOL - "Whether to build kernels for the execution space Kokkos::Threads. If explicit template instantiation (ETI) is enabled in Trilinos, disabling this when Kokkos_ENABLE_PTHREAD is enabled may increase build times. Default: ON if Kokkos is Threads-enabled, OFF otherwise." + "Whether to build kernels for the execution space Kokkos::Threads. If explicit template instantiation (ETI) is enabled in Trilinos, disabling this when Kokkos_ENABLE_THREADS is enabled may increase build times. Default: ON if Kokkos is Threads-enabled, OFF otherwise." 
) #There continues to be name ambiguity with threads vs pthreads -SET(KOKKOSKERNELS_INST_EXECSPACE_PTHREAD ${KOKKOSKERNELS_INST_EXECSPACE_THREADS}) +SET(KOKKOSKERNELS_INST_EXECSPACE_THREADS ${KOKKOSKERNELS_INST_EXECSPACE_THREADS}) -IF(KOKKOSKERNELS_INST_EXECSPACE_PTHREAD AND KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE) +IF(KOKKOSKERNELS_INST_EXECSPACE_THREADS AND KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE) LIST(APPEND DEVICE_LIST "") - IF(NOT KOKKOS_ENABLE_PTHREAD) - MESSAGE(FATAL_ERROR "Set ETI on for PTHREAD, but Kokkos was not configured with the PTHREAD backend") + IF(NOT KOKKOS_ENABLE_THREADS) + MESSAGE(FATAL_ERROR "Set ETI on for THREADS, but Kokkos was not configured with the THREADS backend") ENDIF() ENDIF() @@ -201,7 +201,7 @@ SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_OPENMPTARGET_VALID_MEM_SPACES OPENMPTARGETSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) -SET(EXECSPACE_PTHREAD_VALID_MEM_SPACES HBWSPACE HOSTSPACE) +SET(EXECSPACE_THREADS_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(DEVICES) FOREACH(EXEC ${EXEC_SPACES}) IF (KOKKOSKERNELS_INST_${EXEC}) diff --git a/cmake/kokkoskernels_eti_floats.cmake b/cmake/kokkoskernels_eti_floats.cmake index 69e50af3cd..debf99bb0e 100644 --- a/cmake/kokkoskernels_eti_floats.cmake +++ b/cmake/kokkoskernels_eti_floats.cmake @@ -18,6 +18,13 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to pre instantiate kernels for the scalar type float. Disabling this may increase build times. Default: OFF or unless enabled during a Trilinos build with Trilinos_ENABLE_FLOAT." ) +KOKKOSKERNELS_ADD_OPTION( + INST_HALF + OFF + BOOL + "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::half_t. Disabling this may increase build times. 
Default: OFF" +) + SET(FLOATS FLOAT DOUBLE @@ -25,6 +32,7 @@ SET(FLOATS COMPLEX_DOUBLE) SET(DOUBLE_CPP_TYPE "double") SET(FLOAT_CPP_TYPE "float") +SET(HALF_CPP_TYPE "Kokkos::Experimental::half_t") SET(COMPLEX_FLOAT_CPP_TYPE "Kokkos::complex") SET(COMPLEX_DOUBLE_CPP_TYPE "Kokkos::complex") @@ -63,6 +71,11 @@ IF (KOKKOSKERNELS_INST_FLOAT) LIST(APPEND SCALAR_LIST "float") ENDIF() +# TODO: Fix build errors in kokkos when half_t is used in ETI +#IF (KOKKOSKERNELS_INST_HALF) +# LIST(APPEND SCALAR_LIST "Kokkos::Experimental::half_t") +#ENDIF() + IF (KOKKOSKERNELS_INST_COMPLEX_DOUBLE) LIST(APPEND SCALAR_LIST "complex") ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 15ff4e8bd6..f650168757 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -339,9 +339,7 @@ MACRO(kokkoskernels_export_imported_tpl NAME) ENDIF() SET(TPL_LINK_OPTIONS) - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13.0") - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${TPL_IMPORTED_NAME} INTERFACE_LINK_OPTIONS) - ENDIF() + GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${TPL_IMPORTED_NAME} INTERFACE_LINK_OPTIONS) IF(TPL_LINK_OPTIONS) KOKKOSKERNELS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") ENDIF() @@ -371,9 +369,7 @@ MACRO(kokkoskernels_import_tpl NAME) # I have still been getting errors about ROOT variables being ignored # I'm not sure if this is a scope issue - but make sure # the policy is set before we do any find_package calls - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - CMAKE_POLICY(SET CMP0074 NEW) - ENDIF() + CMAKE_POLICY(SET CMP0074 NEW) IF (KOKKOSKERNELS_ENABLE_TPL_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find @@ -391,16 +387,6 @@ MACRO(kokkoskernels_import_tpl NAME) ENDIF() ENDMACRO(kokkoskernels_import_tpl) -FUNCTION(TARGET_LINK_FLAGS_PORTABLE LIBRARY) - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13") - #great, this works the "right" way - TARGET_LINK_OPTIONS(${LIBRARY} ${ARGN}) - ELSE() - #bummer, this works the "hacky" way - TARGET_LINK_LIBRARIES(${LIBRARY} ${ARGN}) - ENDIF() -ENDFUNCTION(TARGET_LINK_FLAGS_PORTABLE) - FUNCTION(kokkoskernels_link_tpl TARGET) CMAKE_PARSE_ARGUMENTS(TPL "PUBLIC;PRIVATE;INTERFACE" @@ -466,6 +452,20 @@ KOKKOSKERNELS_ADD_TPL_OPTION(CUBLAS ${CUBLAS_DEFAULT} "Whether to enable C KOKKOSKERNELS_ADD_TPL_OPTION(CUSPARSE ${CUSPARSE_DEFAULT} "Whether to enable CUSPARSE" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") +KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_ROCM_TPLS OFF BOOL "Whether ROCM TPLs should be enabled by default. Default: OFF") +# Unlike CUDA, ROCm does not automatically install these TPLs +SET(ROCBLAS_DEFAULT OFF) +SET(ROCSPARSE_DEFAULT OFF) +# Since the default is OFF we do not really need this piece of logic here. 
+# IF(KOKKOSKERNELS_NO_DEFAULT_ROCM_TPLS) +# SET(ROCBLAS_DEFAULT OFF) +# SET(ROCSPARSE_DEFAULT OFF) +# ENDIF() +KOKKOSKERNELS_ADD_TPL_OPTION(ROCBLAS ${ROCBLAS_DEFAULT} "Whether to enable ROCBLAS" + DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") +KOKKOSKERNELS_ADD_TPL_OPTION(ROCSPARSE ${ROCSPARSE_DEFAULT} "Whether to enable ROCSPARSE" + DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") + IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) IF (F77_BLAS_MANGLE STREQUAL "(name,NAME) name ## _") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DADD_ -fopenmp -lgfortran") @@ -501,6 +501,8 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(METIS) KOKKOSKERNELS_IMPORT_TPL(ARMPL) KOKKOSKERNELS_IMPORT_TPL(MAGMA) + KOKKOSKERNELS_IMPORT_TPL(ROCBLAS) + KOKKOSKERNELS_IMPORT_TPL(ROCSPARSE) ELSE () IF (Trilinos_ENABLE_SuperLU5_API) SET(HAVE_KOKKOSKERNELS_SUPERLU5_API TRUE) diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 8e208bb937..8ca2c879ea 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -477,13 +477,13 @@ NUM_PROC_THREADS = 1 # normally produced when WARNINGS is set to YES. # The default value is: NO. -EXTRACT_ALL = NO +EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. -EXTRACT_PRIVATE = NO +EXTRACT_PRIVATE = YES # If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual # methods of a class will be included in the documentation. @@ -495,13 +495,13 @@ EXTRACT_PRIV_VIRTUAL = NO # scope will be included in the documentation. # The default value is: NO. -EXTRACT_PACKAGE = NO +EXTRACT_PACKAGE = YES # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. -EXTRACT_STATIC = NO +EXTRACT_STATIC = YES # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, @@ -2236,7 +2236,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = DOXY # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/index.rst b/docs/index.rst index a728877de3..06240595bf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,11 +22,12 @@ Indices and tables Docs ==== -.. doxygennamespace:: Kokkos - :members: .. doxygennamespace:: KokkosBlas + :project: KokkosKernels :members: .. doxygennamespace:: KokkosSparse + :project: KokkosKernels :members: .. 
doxygennamespace:: KokkosBatched + :project: KokkosKernels :members: \ No newline at end of file diff --git a/example/cmake/in-tree/CMakeLists.txt b/example/cmake/in-tree/CMakeLists.txt index 79dc09b06c..2192d78e29 100644 --- a/example/cmake/in-tree/CMakeLists.txt +++ b/example/cmake/in-tree/CMakeLists.txt @@ -1,6 +1,5 @@ -#Kokkos requires at least 3.10 -#but really you should use 3.12 -cmake_minimum_required (VERSION 3.10) +#Kokkos requires at least 3.16 +cmake_minimum_required (VERSION 3.16) project (MyProgram) diff --git a/example/cmake/install/CMakeLists.txt b/example/cmake/install/CMakeLists.txt index 44a1777e3c..51233783df 100644 --- a/example/cmake/install/CMakeLists.txt +++ b/example/cmake/install/CMakeLists.txt @@ -1,11 +1,8 @@ -#Kokkos requires at least 3.10 -#but really you should use 3.12 -cmake_minimum_required (VERSION 3.10) +#Kokkos requires at least 3.16 +cmake_minimum_required (VERSION 3.16) -IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") - CMAKE_POLICY(SET CMP0074 NEW) -ENDIF() +MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") +CMAKE_POLICY(SET CMP0074 NEW) project (MyProgram) diff --git a/example/fenl/CGSolve.hpp b/example/fenl/CGSolve.hpp index 3a3d2a3301..79c8badfeb 100644 --- a/example/fenl/CGSolve.hpp +++ b/example/fenl/CGSolve.hpp @@ -60,86 +60,80 @@ namespace Kokkos { namespace Example { -template< class ImportType , class SparseMatrixType , class VectorType , class TagType = void > -struct CGSolve ; - - -template< class ImportType , class SparseMatrixType , class VectorType > -struct CGSolve< ImportType , SparseMatrixType , VectorType , - typename std::enable_if<( - Kokkos::Impl::is_view< VectorType >::value && - VectorType::rank == 1 - )>::type > -{ - typedef typename VectorType::value_type scalar_type ; +template +struct CGSolve; + +template +struct CGSolve::value && + VectorType::rank == 1)>::type> { + typedef typename VectorType::value_type scalar_type; typedef typename VectorType::execution_space execution_space; - size_t iteration ; - double iter_time ; - double matvec_time ; - double norm_res ; - - CGSolve( const ImportType & import , - const SparseMatrixType & A , - const VectorType & b , - const VectorType & x , - const size_t maximum_iteration = 200 , - const double tolerance = std::numeric_limits::epsilon() ) - : iteration(0) - , iter_time(0) - , matvec_time(0) - , norm_res(0) - { - const size_t count_owned = import.count_owned ; + size_t iteration; + double iter_time; + double matvec_time; + double norm_res; + + CGSolve(const ImportType& import, const SparseMatrixType& A, + const VectorType& b, const VectorType& x, + const size_t maximum_iteration = 200, + const double tolerance = std::numeric_limits::epsilon()) + : iteration(0), iter_time(0), matvec_time(0), norm_res(0) { + const size_t count_owned = import.count_owned; const size_t count_total = import.count_owned + import.count_receive; // Need input vector to matvec to be owned + received - VectorType pAll ( "cg::p" , count_total ); + VectorType pAll("cg::p", count_total); - VectorType p = Kokkos::subview( pAll , std::pair(0,count_owned) ); - VectorType r ( "cg::r" , count_owned ); - VectorType Ap( "cg::Ap", count_owned ); + VectorType p = + Kokkos::subview(pAll, std::pair(0, count_owned)); + VectorType r("cg::r", count_owned); + VectorType Ap("cg::Ap", count_owned); /* r = b - A * x ; */ - /* p = x */ Kokkos::deep_copy( p , x ); - /* import p */ import( pAll ); - /* Ap = A * p */ KokkosSparse::spmv( "N" , 1.0 , A 
, pAll , 0.0 , Ap); - /* b - Ap => r */ KokkosBlas::update( 1.0 , b , -1.0 , Ap , 0.0 , r); - /* p = r */ Kokkos::deep_copy( p , r ); + /* p = x */ Kokkos::deep_copy(p, x); + /* import p */ import(pAll); + /* Ap = A * p */ KokkosSparse::spmv("N", 1.0, A, pAll, 0.0, Ap); + /* b - Ap => r */ KokkosBlas::update(1.0, b, -1.0, Ap, 0.0, r); + /* p = r */ Kokkos::deep_copy(p, r); - double old_rdot = Kokkos::Example::all_reduce( KokkosBlas::dot( r , r ) , import.comm ); + double old_rdot = + Kokkos::Example::all_reduce(KokkosBlas::dot(r, r), import.comm); - norm_res = sqrt( old_rdot ); - iteration = 0 ; + norm_res = sqrt(old_rdot); + iteration = 0; - Kokkos::Timer wall_clock ; + Kokkos::Timer wall_clock; Kokkos::Timer timer; - while ( tolerance < norm_res && iteration < maximum_iteration ) { - + while (tolerance < norm_res && iteration < maximum_iteration) { /* pAp_dot = dot( p , Ap = A * p ) */ timer.reset(); - /* import p */ import( pAll ); - /* Ap = A * p */ KokkosSparse::spmv( "N", 1.0, A , pAll, 0.0, Ap); + /* import p */ import(pAll); + /* Ap = A * p */ KokkosSparse::spmv("N", 1.0, A, pAll, 0.0, Ap); execution_space().fence(); matvec_time += timer.seconds(); - const double pAp_dot = Kokkos::Example::all_reduce( KokkosBlas::dot( p , Ap ) , import.comm ); - const double alpha = old_rdot / pAp_dot ; + const double pAp_dot = + Kokkos::Example::all_reduce(KokkosBlas::dot(p, Ap), import.comm); + const double alpha = old_rdot / pAp_dot; - /* x += alpha * p ; */ KokkosBlas::axpby( alpha, p , 1.0 , x ); - /* r += -alpha * Ap ; */ KokkosBlas::axpby(-alpha, Ap , 1.0 , r ); + /* x += alpha * p ; */ KokkosBlas::axpby(alpha, p, 1.0, x); + /* r += -alpha * Ap ; */ KokkosBlas::axpby(-alpha, Ap, 1.0, r); - const double r_dot = Kokkos::Example::all_reduce( KokkosBlas::dot( r , r ) , import.comm ); - const double beta = r_dot / old_rdot ; + const double r_dot = + Kokkos::Example::all_reduce(KokkosBlas::dot(r, r), import.comm); + const double beta = r_dot / old_rdot; - /* p = r + beta * p ; */ KokkosBlas::axpby( 1.0 , r , beta , p ); + /* p = r + beta * p ; */ KokkosBlas::axpby(1.0, r, beta, p); - norm_res = std::sqrt( old_rdot = r_dot ); + norm_res = std::sqrt(old_rdot = r_dot); - ++iteration ; + ++iteration; } execution_space().fence(); @@ -147,12 +141,10 @@ struct CGSolve< ImportType , SparseMatrixType , VectorType , } }; -} // namespace Example -} // namespace Kokkos +} // namespace Example +} // namespace Kokkos //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- #endif /* #ifndef KOKKOS_EXAMPLE_CG_SOLVE */ - - diff --git a/example/fenl/main.cpp b/example/fenl/main.cpp index 67bf07d98e..ba99f0341e 100644 --- a/example/fenl/main.cpp +++ b/example/fenl/main.cpp @@ -65,153 +65,166 @@ //---------------------------------------------------------------------------- -enum { CMD_USE_THREADS = 0 - , CMD_USE_NUMA - , CMD_USE_CORE_PER_NUMA - , CMD_USE_CUDA - , CMD_USE_OPENMP - , CMD_USE_CUDA_DEV - , CMD_USE_FIXTURE_X - , CMD_USE_FIXTURE_Y - , CMD_USE_FIXTURE_Z - , CMD_USE_FIXTURE_BEGIN - , CMD_USE_FIXTURE_END - , CMD_USE_FIXTURE_QUADRATIC - , CMD_USE_ATOMIC - , CMD_USE_TRIALS - , CMD_VTUNE - , CMD_PRINT - , CMD_ECHO - , CMD_ERROR - , CMD_COUNT }; - -void print_cmdline( std::ostream & s , const int cmd[] ) -{ - if ( cmd[ CMD_USE_THREADS ] ) { - s << " Threads(" << cmd[ CMD_USE_THREADS ] - << ") NUMA(" << cmd[ CMD_USE_NUMA ] - << ") CORE_PER_NUMA(" << cmd[ CMD_USE_CORE_PER_NUMA ] - << ")" ; +enum { + CMD_USE_THREADS 
= 0, + CMD_USE_NUMA, + CMD_USE_CORE_PER_NUMA, + CMD_USE_CUDA, + CMD_USE_OPENMP, + CMD_USE_CUDA_DEV, + CMD_USE_FIXTURE_X, + CMD_USE_FIXTURE_Y, + CMD_USE_FIXTURE_Z, + CMD_USE_FIXTURE_BEGIN, + CMD_USE_FIXTURE_END, + CMD_USE_FIXTURE_QUADRATIC, + CMD_USE_ATOMIC, + CMD_USE_TRIALS, + CMD_VTUNE, + CMD_PRINT, + CMD_ECHO, + CMD_ERROR, + CMD_COUNT +}; + +void print_cmdline(std::ostream& s, const int cmd[]) { + if (cmd[CMD_USE_THREADS]) { + s << " Threads(" << cmd[CMD_USE_THREADS] << ") NUMA(" << cmd[CMD_USE_NUMA] + << ") CORE_PER_NUMA(" << cmd[CMD_USE_CORE_PER_NUMA] << ")"; } - if ( cmd[ CMD_USE_OPENMP ] ) { - s << " OpenMP(" << cmd[ CMD_USE_OPENMP ] - << ") NUMA(" << cmd[ CMD_USE_NUMA ] - << ") CORE_PER_NUMA(" << cmd[ CMD_USE_CORE_PER_NUMA ] - << ")" ; + if (cmd[CMD_USE_OPENMP]) { + s << " OpenMP(" << cmd[CMD_USE_OPENMP] << ") NUMA(" << cmd[CMD_USE_NUMA] + << ") CORE_PER_NUMA(" << cmd[CMD_USE_CORE_PER_NUMA] << ")"; } - if ( cmd[ CMD_USE_FIXTURE_X ] ) { - s << " Fixture(" << cmd[ CMD_USE_FIXTURE_X ] - << "x" << cmd[ CMD_USE_FIXTURE_Y ] - << "x" << cmd[ CMD_USE_FIXTURE_Z ] - << ")" ; + if (cmd[CMD_USE_FIXTURE_X]) { + s << " Fixture(" << cmd[CMD_USE_FIXTURE_X] << "x" << cmd[CMD_USE_FIXTURE_Y] + << "x" << cmd[CMD_USE_FIXTURE_Z] << ")"; } - if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) { - s << " Fixture( " << cmd[ CMD_USE_FIXTURE_BEGIN ] - << " .. " << cmd[ CMD_USE_FIXTURE_END ] - << " )" ; + if (cmd[CMD_USE_FIXTURE_BEGIN]) { + s << " Fixture( " << cmd[CMD_USE_FIXTURE_BEGIN] << " .. " + << cmd[CMD_USE_FIXTURE_END] << " )"; } - if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { - s << " Quadratic-Element" ; + if (cmd[CMD_USE_FIXTURE_QUADRATIC]) { + s << " Quadratic-Element"; } - if ( cmd[ CMD_USE_CUDA ] ) { - s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ; + if (cmd[CMD_USE_CUDA]) { + s << " CUDA(" << cmd[CMD_USE_CUDA_DEV] << ")"; } - if ( cmd[ CMD_USE_ATOMIC ] ) { - s << " ATOMIC" ; + if (cmd[CMD_USE_ATOMIC]) { + s << " ATOMIC"; } - if ( cmd[ CMD_USE_TRIALS ] ) { - s << " TRIALS(" << cmd[ CMD_USE_TRIALS ] << ")" ; + if (cmd[CMD_USE_TRIALS]) { + s << " TRIALS(" << cmd[CMD_USE_TRIALS] << ")"; } - if ( cmd[ CMD_VTUNE ] ) { - s << " VTUNE" ; + if (cmd[CMD_VTUNE]) { + s << " VTUNE"; } - if ( cmd[ CMD_PRINT ] ) { - s << " PRINT" ; + if (cmd[CMD_PRINT]) { + s << " PRINT"; } - s << std::endl ; + s << std::endl; } -void print_perf_value( std::ostream & s , const std::vector & widths, const Kokkos::Example::FENL::Perf & perf ) -{ - int i=0; +void print_perf_value(std::ostream& s, const std::vector& widths, + const Kokkos::Example::FENL::Perf& perf) { + int i = 0; s << std::setw(widths[i++]) << perf.global_elem_count << " ,"; s << std::setw(widths[i++]) << perf.global_node_count << " ,"; s << std::setw(widths[i++]) << perf.newton_iter_count << " ,"; s << std::setw(widths[i++]) << perf.cg_iter_count << " ,"; s << std::setw(widths[i++]) << perf.map_ratio << " ,"; - s << std::setw(widths[i++]) << ( perf.fill_node_set * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.scan_node_count * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.fill_graph_entries * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.sort_graph_entries * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.fill_element_graph * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.create_sparse_matrix * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.fill_time * 1000.0 ) / 
perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.bc_time * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( ( perf.matvec_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( ( perf.cg_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,"; - s << std::setw(widths[i]) << perf.error_max; - s << std::endl ; + s << std::setw(widths[i++]) + << (perf.fill_node_set * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.scan_node_count * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.fill_graph_entries * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.sort_graph_entries * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.fill_element_graph * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.create_sparse_matrix * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.fill_time * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.bc_time * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << ((perf.matvec_time * 1000.0) / perf.cg_iter_count) / + perf.global_node_count + << " ,"; + s << std::setw(widths[i++]) + << ((perf.cg_time * 1000.0) / perf.cg_iter_count) / perf.global_node_count + << " ,"; + s << std::setw(widths[i]) << perf.error_max; + s << std::endl; } -template< class Device , Kokkos::Example::BoxElemPart::ElemOrder ElemOrder > -void run( MPI_Comm comm , const int cmd[] ) -{ - int comm_rank = 0 ; - int comm_size = 1 ; +template +void run(MPI_Comm comm, const int cmd[]) { + int comm_rank = 0; + int comm_size = 1; -#if defined( KOKKOS_ENABLE_MPI ) - MPI_Comm_rank( comm , & comm_rank ); - MPI_Comm_size( comm , & comm_size ); +#if defined(KOKKOS_ENABLE_MPI) + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); #else - comm = 0 ; + comm = 0; (void)comm_size; #endif + if (0 == comm_rank) { + if (cmd[CMD_USE_THREADS]) { + std::cout << "THREADS , " << cmd[CMD_USE_THREADS]; + } else if (cmd[CMD_USE_OPENMP]) { + std::cout << "OPENMP , " << cmd[CMD_USE_OPENMP]; + } else if (cmd[CMD_USE_CUDA]) { + std::cout << "CUDA"; + } - if ( 0 == comm_rank ) { - if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; } - else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; } - else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; } - - if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; } - else { std::cout << " , LINEAR-ELEMENT" ; } + if (cmd[CMD_USE_FIXTURE_QUADRATIC]) { + std::cout << " , QUADRATIC-ELEMENT"; + } else { + std::cout << " , LINEAR-ELEMENT"; + } - if ( cmd[ CMD_USE_ATOMIC ] ) { std::cout << " , USING ATOMICS" ; } + if (cmd[CMD_USE_ATOMIC]) { + std::cout << " , USING ATOMICS"; + } } - std::vector< std::pair > headers; - - - headers.push_back(std::make_pair("ELEMS","count")); - headers.push_back(std::make_pair("NODES","count")); - headers.push_back(std::make_pair("NEWTON","iter")); - headers.push_back(std::make_pair("CG","iter")); - headers.push_back(std::make_pair("MAP_RATIO","ratio")); - headers.push_back(std::make_pair("SET_FILL/NODE","millisec")); - headers.push_back(std::make_pair("SCAN/NODE","millisec")); - headers.push_back(std::make_pair("GRAPH_FILL/NODE","millisec")); - 
headers.push_back(std::make_pair("SORT/NODE","millisec")); - headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE","millisec")); - headers.push_back(std::make_pair("MATRIX_CREATE/NODE","millisec")); - headers.push_back(std::make_pair("MATRIX_FILL/NODE","millisec")); - headers.push_back(std::make_pair("BOUNDARY/NODE","millisec")); - headers.push_back(std::make_pair("MAT_VEC/ITER/ROW","millisec")); - headers.push_back(std::make_pair("CG/ITER/ROW","millisec")); - headers.push_back(std::make_pair("ERROR","ratio")); + std::vector > headers; + + headers.push_back(std::make_pair("ELEMS", "count")); + headers.push_back(std::make_pair("NODES", "count")); + headers.push_back(std::make_pair("NEWTON", "iter")); + headers.push_back(std::make_pair("CG", "iter")); + headers.push_back(std::make_pair("MAP_RATIO", "ratio")); + headers.push_back(std::make_pair("SET_FILL/NODE", "millisec")); + headers.push_back(std::make_pair("SCAN/NODE", "millisec")); + headers.push_back(std::make_pair("GRAPH_FILL/NODE", "millisec")); + headers.push_back(std::make_pair("SORT/NODE", "millisec")); + headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE", "millisec")); + headers.push_back(std::make_pair("MATRIX_CREATE/NODE", "millisec")); + headers.push_back(std::make_pair("MATRIX_FILL/NODE", "millisec")); + headers.push_back(std::make_pair("BOUNDARY/NODE", "millisec")); + headers.push_back(std::make_pair("MAT_VEC/ITER/ROW", "millisec")); + headers.push_back(std::make_pair("CG/ITER/ROW", "millisec")); + headers.push_back(std::make_pair("ERROR", "ratio")); // find print widths size_t min_width = 10; - std::vector< size_t > widths(headers.size()); - for (size_t i=0, ie=headers.size(); i widths(headers.size()); + for (size_t i = 0, ie = headers.size(); i < ie; ++i) + widths[i] = std::max(min_width, headers[i].first.size() + 1); // print column headers - if ( 0 == comm_rank ) { - std::cout << std::endl ; - for (size_t i=0; i - ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem ) - : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear > - ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem ) - ; - - if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf ); + cmd[CMD_USE_FIXTURE_QUADRATIC] + ? Kokkos::Example::FENL::fenl< + Device, Kokkos::Example::BoxElemPart::ElemQuadratic>( + comm, cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], + cmd[CMD_USE_ATOMIC], nelem) + : Kokkos::Example::FENL::fenl< + Device, Kokkos::Example::BoxElemPart::ElemLinear>( + comm, cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], + cmd[CMD_USE_ATOMIC], nelem); + + if (0 == comm_rank) print_perf_value(std::cout, widths, perf); } - } - else { - int nelem[3] = { cmd[ CMD_USE_FIXTURE_X ] , - cmd[ CMD_USE_FIXTURE_Y ] , - cmd[ CMD_USE_FIXTURE_Z ] }; + } else { + int nelem[3] = {cmd[CMD_USE_FIXTURE_X], cmd[CMD_USE_FIXTURE_Y], + cmd[CMD_USE_FIXTURE_Z]}; const Kokkos::Example::FENL::Perf perf = - cmd[ CMD_USE_FIXTURE_QUADRATIC ] - ? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic > - ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem ) - : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear > - ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem ) - ; - - if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf ); + cmd[CMD_USE_FIXTURE_QUADRATIC] + ? 
Kokkos::Example::FENL::fenl< + Device, Kokkos::Example::BoxElemPart::ElemQuadratic>( + comm, cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], + cmd[CMD_USE_ATOMIC], nelem) + : Kokkos::Example::FENL::fenl< + Device, Kokkos::Example::BoxElemPart::ElemLinear>( + comm, cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], + cmd[CMD_USE_ATOMIC], nelem); + + if (0 == comm_rank) print_perf_value(std::cout, widths, perf); } } //---------------------------------------------------------------------------- -int main( int argc , char ** argv ) -{ - int comm_rank = 0 ; - int comm_size = 1 ; +int main(int argc, char** argv) { + int comm_rank = 0; + int comm_size = 1; -#if defined( KOKKOS_ENABLE_MPI ) - MPI_Init( & argc , & argv ); - MPI_Comm comm = MPI_COMM_WORLD ; - MPI_Comm_rank( comm , & comm_rank ); - MPI_Comm_size( comm , & comm_size ); +#if defined(KOKKOS_ENABLE_MPI) + MPI_Init(&argc, &argv); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); #else - MPI_Comm comm = 0 ; - (void) comm; - (void) comm_size; + MPI_Comm comm = 0; + (void)comm; + (void)comm_size; #endif - int cmdline[ CMD_COUNT ] ; - - for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ; - - if ( 0 == comm_rank ) { - for ( int i = 1 ; i < argc ; ++i ) { - if ( 0 == strcasecmp( argv[i] , "threads" ) ) { - cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] ); - } - else if ( 0 == strcasecmp( argv[i] , "openmp" ) ) { - cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] ); - } - else if ( 0 == strcasecmp( argv[i] , "cores" ) ) { - sscanf( argv[++i] , "%dx%d" , - cmdline + CMD_USE_NUMA , - cmdline + CMD_USE_CORE_PER_NUMA ); - } - else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "cuda-dev" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; - } - else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) { - sscanf( argv[++i] , "%dx%dx%d" , - cmdline + CMD_USE_FIXTURE_X , - cmdline + CMD_USE_FIXTURE_Y , - cmdline + CMD_USE_FIXTURE_Z ); - } - else if ( 0 == strcasecmp( argv[i] , "fixture-range" ) ) { - sscanf( argv[++i] , "%d..%d" , - cmdline + CMD_USE_FIXTURE_BEGIN , - cmdline + CMD_USE_FIXTURE_END ); - } - else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) { - cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) { - cmdline[ CMD_USE_ATOMIC ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "trials" ) ) { - cmdline[ CMD_USE_TRIALS ] = atoi( argv[++i] ) ; - } - else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) { - cmdline[ CMD_VTUNE ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "print" ) ) { - cmdline[ CMD_PRINT ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "echo" ) ) { - cmdline[ CMD_ECHO ] = 1 ; - } - else { - cmdline[ CMD_ERROR ] = 1 ; - - std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; + int cmdline[CMD_COUNT]; + + for (int i = 0; i < CMD_COUNT; ++i) cmdline[i] = 0; + + if (0 == comm_rank) { + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "threads")) { + cmdline[CMD_USE_THREADS] = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "openmp")) { + cmdline[CMD_USE_OPENMP] = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "cores")) { + sscanf(argv[++i], "%dx%d", cmdline + CMD_USE_NUMA, + cmdline + CMD_USE_CORE_PER_NUMA); + } else if (0 == Test::string_compare_no_case(argv[i], "cuda")) { + cmdline[CMD_USE_CUDA] = 1; + } else if (0 == 
Test::string_compare_no_case(argv[i], "cuda-dev")) { + cmdline[CMD_USE_CUDA] = 1; + cmdline[CMD_USE_CUDA_DEV] = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "fixture")) { + sscanf(argv[++i], "%dx%dx%d", cmdline + CMD_USE_FIXTURE_X, + cmdline + CMD_USE_FIXTURE_Y, cmdline + CMD_USE_FIXTURE_Z); + } else if (0 == Test::string_compare_no_case(argv[i], "fixture-range")) { + sscanf(argv[++i], "%d..%d", cmdline + CMD_USE_FIXTURE_BEGIN, + cmdline + CMD_USE_FIXTURE_END); + } else if (0 == + Test::string_compare_no_case(argv[i], "fixture-quadratic")) { + cmdline[CMD_USE_FIXTURE_QUADRATIC] = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "atomic")) { + cmdline[CMD_USE_ATOMIC] = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "trials")) { + cmdline[CMD_USE_TRIALS] = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "vtune")) { + cmdline[CMD_VTUNE] = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "print")) { + cmdline[CMD_PRINT] = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "echo")) { + cmdline[CMD_ECHO] = 1; + } else { + cmdline[CMD_ERROR] = 1; + + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; } } - if ( cmdline[ CMD_ECHO ] && 0 == comm_rank ) { print_cmdline( std::cout , cmdline ); } + if (cmdline[CMD_ECHO] && 0 == comm_rank) { + print_cmdline(std::cout, cmdline); + } } -#if defined( KOKKOS_ENABLE_MPI ) - MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm ); +#if defined(KOKKOS_ENABLE_MPI) + MPI_Bcast(cmdline, CMD_COUNT, MPI_INT, 0, comm); #endif - if ( cmdline[ CMD_VTUNE ] ) { + if (cmdline[CMD_VTUNE]) { std::stringstream cmd; - pid_t my_os_pid=getpid(); + pid_t my_os_pid = getpid(); const std::string vtune_loc = - "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl"; + "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl"; const std::string output_dir = "./vtune/vtune."; - const int p_rank = comm_rank; - cmd << vtune_loc - << " -collect hotspots -result-dir " << output_dir << p_rank - << " -target-pid " << my_os_pid << " &"; - if (p_rank == 0) - std::cout << cmd.str() << std::endl; + const int p_rank = comm_rank; + cmd << vtune_loc << " -collect hotspots -result-dir " << output_dir + << p_rank << " -target-pid " << my_os_pid << " &"; + if (p_rank == 0) std::cout << cmd.str() << std::endl; system(cmd.str().c_str()); system("sleep 10"); } - if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) { - - if ( ! cmdline[ CMD_USE_TRIALS ] ) { cmdline[ CMD_USE_TRIALS ] = 1 ; } - - if ( ! cmdline[ CMD_USE_FIXTURE_X ] && ! 
cmdline[ CMD_USE_FIXTURE_BEGIN ] ) { - cmdline[ CMD_USE_FIXTURE_X ] = 2 ; - cmdline[ CMD_USE_FIXTURE_Y ] = 2 ; - cmdline[ CMD_USE_FIXTURE_Z ] = 2 ; + if (!cmdline[CMD_ERROR] && !cmdline[CMD_ECHO]) { + if (!cmdline[CMD_USE_TRIALS]) { + cmdline[CMD_USE_TRIALS] = 1; } -#if defined( KOKKOS_ENABLE_THREADS ) + if (!cmdline[CMD_USE_FIXTURE_X] && !cmdline[CMD_USE_FIXTURE_BEGIN]) { + cmdline[CMD_USE_FIXTURE_X] = 2; + cmdline[CMD_USE_FIXTURE_Y] = 2; + cmdline[CMD_USE_FIXTURE_Z] = 2; + } - if ( cmdline[ CMD_USE_THREADS ] ) { +#if defined(KOKKOS_ENABLE_THREADS) - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] , - cmdline[ CMD_USE_NUMA ] , - cmdline[ CMD_USE_CORE_PER_NUMA ] ); - } - else { - Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] ); + if (cmdline[CMD_USE_THREADS]) { + if (cmdline[CMD_USE_NUMA] && cmdline[CMD_USE_CORE_PER_NUMA]) { + Kokkos::Threads::initialize(cmdline[CMD_USE_THREADS], + cmdline[CMD_USE_NUMA], + cmdline[CMD_USE_CORE_PER_NUMA]); + } else { + Kokkos::Threads::initialize(cmdline[CMD_USE_THREADS]); } - run< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline ); + run(comm, + cmdline); Kokkos::Threads::finalize(); } #endif -#if defined( KOKKOS_ENABLE_OPENMP ) - - if ( cmdline[ CMD_USE_OPENMP ] ) { +#if defined(KOKKOS_ENABLE_OPENMP) - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] , - cmdline[ CMD_USE_NUMA ] , - cmdline[ CMD_USE_CORE_PER_NUMA ] ); - } - else { - Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] ); + if (cmdline[CMD_USE_OPENMP]) { + if (cmdline[CMD_USE_NUMA] && cmdline[CMD_USE_CORE_PER_NUMA]) { + Kokkos::OpenMP::initialize(cmdline[CMD_USE_OPENMP], + cmdline[CMD_USE_NUMA], + cmdline[CMD_USE_CORE_PER_NUMA]); + } else { + Kokkos::OpenMP::initialize(cmdline[CMD_USE_OPENMP]); } - run< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline ); + run(comm, + cmdline); Kokkos::OpenMP::finalize(); } #endif -#if defined( KOKKOS_ENABLE_CUDA ) - if ( cmdline[ CMD_USE_CUDA ] ) { +#if defined(KOKKOS_ENABLE_CUDA) + if (cmdline[CMD_USE_CUDA]) { // Use the last device: Kokkos::HostSpace::execution_space::initialize(); - Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cmdline[ CMD_USE_CUDA_DEV ] ) ); + Kokkos::Cuda::initialize( + Kokkos::Cuda::SelectDevice(cmdline[CMD_USE_CUDA_DEV])); - run< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline ); + run(comm, + cmdline); Kokkos::Cuda::finalize(); Kokkos::HostSpace::execution_space::finalize(); } #endif - } -#if defined( KOKKOS_ENABLE_MPI ) +#if defined(KOKKOS_ENABLE_MPI) MPI_Finalize(); #endif - return cmdline[ CMD_ERROR ] ? -1 : 0 ; + return cmdline[CMD_ERROR] ? 
-1 : 0; } - diff --git a/example/gmres/CMakeLists.txt b/example/gmres/CMakeLists.txt index 15bfaac95d..4265fc4a5f 100644 --- a/example/gmres/CMakeLists.txt +++ b/example/gmres/CMakeLists.txt @@ -1,29 +1,29 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -# Workaround https://github.com/kokkos/kokkos/issues/4376 for ibm/xl -IF (NOT ${KOKKOS_COMPILER_IBM}) KOKKOSKERNELS_ADD_EXECUTABLE( gmres_ex_real_A SOURCES ex_real_A.cpp ) +# FIXME_SYCL CUDA_ERROR_INVALID_ADDRESS_SPACE +IF(NOT KOKKOS_ENABLE_SYCL) KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( gmres_test_real_A SOURCES test_real_A.cpp ) +ENDIF() KOKKOSKERNELS_ADD_EXECUTABLE( gmres_test_cmplx_A SOURCES test_cmplx_A.cpp ) +# FIXME_SYCL CUDA_ERROR_INVALID_ADDRESS_SPACE +IF(NOT KOKKOS_ENABLE_SYCL) KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( gmres_test_prec SOURCES test_prec.cpp ) - -ELSE () - MESSAGE (STATUS "SKIPPING gmres examples - Kokkos::complex unsupported with ibm/xlC as host compiler") -ENDIF () +ENDIF() diff --git a/example/gmres/gmres.hpp b/example/gmres/gmres.hpp index 1354e4637c..48a6e4ae0d 100644 --- a/example/gmres/gmres.hpp +++ b/example/gmres/gmres.hpp @@ -42,28 +42,57 @@ //@HEADER */ -#include -#include"KokkosKernels_IOUtils.hpp" -#include -#include -#include -#include -#include -#include +#include +#include "Kokkos_ArithTraits.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include +#include +#include +#include +#include +#include +#include "KokkosKernels_Error.hpp" //////////////////////////////////////////////////////////////////////////////// // libstdc++ half_t overloads //////////////////////////////////////////////////////////////////////////////// +namespace Kokkos { +namespace Issue1172WorkAround { #if !KOKKOS_HALF_T_IS_FLOAT -Kokkos::Experimental::half_t abs(Kokkos::Experimental::half_t arg) { - return arg < 0.0 ? -arg : arg; +Kokkos::Experimental::half_t fabs(Kokkos::Experimental::half_t arg) { + using AT = Kokkos::Details::ArithTraits; + return AT::abs(arg); } -Kokkos::complex abs(Kokkos::complex arg) noexcept { - return Kokkos::complex(abs(Kokkos::complex((double) arg.real(), (double) arg.imag()))); +Kokkos::Experimental::half_t fabs( + Kokkos::complex arg) noexcept { + return Kokkos::Experimental::half_t(Kokkos::abs( + Kokkos::complex((double)arg.real(), (double)arg.imag()))); } #endif // KOKKOS_HALF_T_IS_FLOAT +#if !KOKKOS_BHALF_T_IS_FLOAT +Kokkos::Experimental::bhalf_t fabs(Kokkos::Experimental::bhalf_t arg) { + using AT = Kokkos::Details::ArithTraits; + return AT::abs(arg); +} + +Kokkos::Experimental::bhalf_t fabs( + Kokkos::complex arg) noexcept { + return Kokkos::Experimental::bhalf_t(Kokkos::abs( + Kokkos::complex((double)arg.real(), (double)arg.imag()))); +} +#endif // KOKKOS_BHALF_T_IS_FLOAT + +// This fabs wrapper was added to resolve: +// https://github.com/kokkos/kokkos-kernels/issues/1172 +template +KOKKOS_INLINE_FUNCTION T fabs(const Kokkos::complex &x) { + return Kokkos::abs(x); +} +} // namespace Issue1172WorkAround +} // namespace Kokkos + // This struct is returned to the user to give solver // statistics and convergence status. 
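+// NOTE: a minimal, hypothetical usage sketch only (it assumes the gmres() entry
+// point and the stats fields declared further down in this file); callers
+// typically inspect the returned object along these lines:
+//
+//   GmresOpts<double> opts;   // defaults below: tol=1e-8, m=50, maxRestart=50, CGS2
+//   GmresStats stats =
+//       gmres<double, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>(A, B, X, opts);
+//   if (stats.convFlagVal == GmresStats::FLAG::Conv) {
+//     std::cout << "Converged in " << stats.numIters << " iterations, "
+//               << "relative residual " << stats.endRelRes << std::endl;
+//   } else {
+//     std::cout << "GMRES flag: " << stats.convFlag() << std::endl;
+//   }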
struct GmresStats { @@ -72,53 +101,49 @@ struct GmresStats { enum FLAG { Conv, NoConv, LOA }; FLAG convFlagVal; std::string convFlag() { - switch(convFlagVal){ - case Conv: - return "Converged"; - case NoConv: - return "Not Converged"; - case LOA: - return "Solver has had loss of accuracy."; - default: - return "Flag not defined."; + switch (convFlagVal) { + case Conv: return "Converged"; + case NoConv: return "Not Converged"; + case LOA: return "Solver has had loss of accuracy."; + default: return "Flag not defined."; } } }; // This struct allows the user to pass in several -// options to the solver. -template< class ScalarType > -struct GmresOpts -{ - typename Kokkos::Details::ArithTraits::mag_type tol; - int m; - int maxRestart; - std::string ortho; - std::string precSide; - - GmresOpts(): - tol(1e-8), - m(50), - maxRestart(50), - ortho("CGS2") { } +// options to the solver. +template +struct GmresOpts { + typename Kokkos::Details::ArithTraits::mag_type tol; + int m; + int maxRestart; + std::string ortho; + std::string precSide; + + GmresOpts() : tol(1e-8), m(50), maxRestart(50), ortho("CGS2") {} }; -template< class ScalarType, class Layout, class EXSP, class OrdinalType = int > - GmresStats gmres( const KokkosSparse::CrsMatrix &A, - const Kokkos::View &B, - Kokkos::View &X, - const GmresOpts &opts, - const KokkosSparse::Experimental::Preconditioner * const M = NULL){ +template +GmresStats gmres( + const KokkosSparse::CrsMatrix &A, + const Kokkos::View &B, + Kokkos::View &X, + const GmresOpts &opts, + const KokkosSparse::Experimental::Preconditioner< + ScalarType, Layout, EXSP, OrdinalType> *const M = NULL) { + using namespace Kokkos::Issue1172WorkAround; // For 'fabs' wrappers above Kokkos::Profiling::pushRegion("GMRES::TotalTime:"); typedef Kokkos::Details::ArithTraits AT; - typedef typename AT::val_type ST; // So this code will run with ScalarType = std::complex. + typedef typename AT::val_type + ST; // So this code will run with ScalarType = std::complex. typedef typename AT::mag_type MT; - ST one = AT::one(); + ST one = AT::one(); ST zero = AT::zero(); - typedef Kokkos::View ViewVectorType; - typedef Kokkos::View ViewHostVectorType; - typedef Kokkos::View ViewMatrixType; + typedef Kokkos::View ViewVectorType; + typedef Kokkos::View + ViewHostVectorType; + typedef Kokkos::View ViewMatrixType; unsigned int n = A.numRows(); @@ -126,233 +151,263 @@ template< class ScalarType, class Layout, class EXSP, class OrdinalType = int > const int m = opts.m; // Check compatibility of dimensions at run time. 
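+  // NOTE (descriptive comment, no new behavior): the checks below enforce the
+  // preconditions of this restarted GMRES implementation: A must be square
+  // (n x n), X and B must both have length n, the restart length m must be
+  // positive, and maxRestart must be non-negative; violations raise
+  // KokkosKernels::Impl::throw_runtime_exception or std::invalid_argument.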
- if ( n != unsigned(A.numCols()) ){ + if (n != unsigned(A.numCols())) { std::ostringstream os; os << "gmres: A must be a square matrix: " - << "numRows: " << n << " numCols: " << A.numCols(); - Kokkos::Impl::throw_runtime_exception (os.str ()); + << "numRows: " << n << " numCols: " << A.numCols(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if (X.extent(0) != B.extent(0) || - X.extent(0) != n ) { + if (X.extent(0) != B.extent(0) || X.extent(0) != n) { std::ostringstream os; os << "gmres: Dimensions of A, X, and B do not match: " << "A: " << n << " x " << n << ", X: " << X.extent(0) << "x 1, B: " << B.extent(0) << " x 1"; - Kokkos::Impl::throw_runtime_exception (os.str ()); + KokkosKernels::Impl::throw_runtime_exception(os.str()); } - //Check parameter validity: - if(m <= 0){ - throw std::invalid_argument("gmres: Please choose restart size m greater than zero."); + // Check parameter validity: + if (m <= 0) { + throw std::invalid_argument( + "gmres: Please choose restart size m greater than zero."); } - if(opts.maxRestart < 0){ - throw std::invalid_argument("gmres: Please choose maxRestart greater than zero."); + if (opts.maxRestart < 0) { + throw std::invalid_argument( + "gmres: Please choose maxRestart greater than zero."); } bool converged = false; - int cycle = 0; // How many times have we restarted? - int numIters = 0; //Number of iterations within the cycle before convergence. + int cycle = 0; // How many times have we restarted? + int numIters = 0; // Number of iterations within the cycle before + // convergence. MT nrmB, trueRes, relRes, shortRelRes; GmresStats myStats; std::cout << "Convergence tolerance is: " << opts.tol << std::endl; - ViewVectorType Xiter("Xiter",n); //Intermediate solution at iterations before restart. - ViewVectorType Res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Res"),n); //Residual vector - ViewVectorType Wj(Kokkos::view_alloc(Kokkos::WithoutInitializing, "W_j"),n); //Tmp work vector 1 - ViewVectorType Wj2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "W_j2"),n); //Tmp work vector 2 - ViewHostVectorType GVec_h(Kokkos::view_alloc(Kokkos::WithoutInitializing, "GVec"),m+1); - ViewMatrixType GLsSoln("GLsSoln",m,1);//LS solution vec for Givens Rotation. Must be 2-D for trsm. - typename ViewMatrixType::HostMirror GLsSoln_h = Kokkos::create_mirror_view(GLsSoln); //This one is needed for triangular solve. - ViewHostVectorType CosVal_h("CosVal",m); - ViewHostVectorType SinVal_h("SinVal",m); - ViewMatrixType V(Kokkos::view_alloc(Kokkos::WithoutInitializing, "V"),n,m+1); - ViewMatrixType VSub; //Subview of 1st m cols for updating soln. - ViewVectorType orthoTmp(Kokkos::view_alloc(Kokkos::WithoutInitializing, "orthoTmp"),m); - - ViewMatrixType H("H",m+1,m); //H matrix on device. Also used in Arn Rec debug. - typename ViewMatrixType::HostMirror H_h = Kokkos::create_mirror_view(H); //Make H into a host view of H. - - //Compute initial residuals: + ViewVectorType Xiter( + "Xiter", n); // Intermediate solution at iterations before restart. + ViewVectorType Res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Res"), + n); // Residual vector + ViewVectorType Wj(Kokkos::view_alloc(Kokkos::WithoutInitializing, "W_j"), + n); // Tmp work vector 1 + ViewVectorType Wj2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "W_j2"), + n); // Tmp work vector 2 + ViewHostVectorType GVec_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "GVec"), m + 1); + ViewMatrixType GLsSoln( + "GLsSoln", m, + 1); // LS solution vec for Givens Rotation. Must be 2-D for trsm. 
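+  // NOTE (descriptive comment on the workspace allocated above and below):
+  //   V        : n x (m+1), the Arnoldi/Krylov basis vectors
+  //   H / H_h  : (m+1) x m upper-Hessenberg matrix (host mirror used for the
+  //              Givens updates and the small triangular solve)
+  //   GVec_h, CosVal_h, SinVal_h : rotated right-hand side and Givens coefficients
+  //   Wj, Wj2  : scratch vectors for SpMV and preconditioner applications
+  //   Xiter    : intermediate solution, so X is only overwritten on acceptance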
+ typename ViewMatrixType::HostMirror GLsSoln_h = Kokkos::create_mirror_view( + GLsSoln); // This one is needed for triangular solve. + ViewHostVectorType CosVal_h("CosVal", m); + ViewHostVectorType SinVal_h("SinVal", m); + ViewMatrixType V(Kokkos::view_alloc(Kokkos::WithoutInitializing, "V"), n, + m + 1); + ViewMatrixType VSub; // Subview of 1st m cols for updating soln. + ViewVectorType orthoTmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "orthoTmp"), m); + + ViewMatrixType H("H", m + 1, + m); // H matrix on device. Also used in Arn Rec debug. + typename ViewMatrixType::HostMirror H_h = + Kokkos::create_mirror_view(H); // Make H into a host view of H. + + // Compute initial residuals: nrmB = KokkosBlas::nrm2(B); - Kokkos::deep_copy(Res,B); + Kokkos::deep_copy(Res, B); - //This is initial true residual, so don't need prec here. - KokkosSparse::spmv("N", one, A, X, zero, Wj); // wj = Ax - KokkosBlas::axpy(-one, Wj, Res); // res = res-Wj = b-Ax. + // This is initial true residual, so don't need prec here. + KokkosSparse::spmv("N", one, A, X, zero, Wj); // wj = Ax + KokkosBlas::axpy(-one, Wj, Res); // res = res-Wj = b-Ax. trueRes = KokkosBlas::nrm2(Res); - if( nrmB != 0 ){ - relRes = trueRes/nrmB; - } - else if( trueRes == 0 ){ + if (nrmB != 0) { + relRes = trueRes / nrmB; + } else if (trueRes == 0) { relRes = trueRes; - } - else{ //B is zero, but X has wrong initial guess. - Kokkos::deep_copy(X,0.0); + } else { // B is zero, but X has wrong initial guess. + Kokkos::deep_copy(X, 0.0); relRes = 0; } shortRelRes = relRes; std::cout << "Initial relative residual is: " << relRes << std::endl; - if( relRes < opts.tol ){ + if (relRes < opts.tol) { converged = true; } - while( !converged && cycle <= opts.maxRestart && shortRelRes >= 1e-14){ + while (!converged && cycle <= opts.maxRestart && shortRelRes >= 1e-14) { GVec_h(0) = trueRes; // Run Arnoldi iteration: - auto Vj = Kokkos::subview(V,Kokkos::ALL,0); - Kokkos::deep_copy(Vj,Res); - KokkosBlas::scal(Vj,one/trueRes,Vj); //V0 = V0/norm(V0) - - for (int j = 0; j < m; j++){ - if( M != NULL){ //Apply Right prec - M->apply(Vj, Wj2); // wj2 = M*Vj - KokkosSparse::spmv("N", one, A, Wj2, zero, Wj); //wj = A*MVj = A*Wj2 - } - else{ - KokkosSparse::spmv("N", one, A, Vj, zero, Wj); //wj = A*Vj + auto Vj = Kokkos::subview(V, Kokkos::ALL, 0); + Kokkos::deep_copy(Vj, Res); + KokkosBlas::scal(Vj, one / trueRes, Vj); // V0 = V0/norm(V0) + + for (int j = 0; j < m; j++) { + if (M != NULL) { // Apply Right prec + M->apply(Vj, Wj2); // wj2 = M*Vj + KokkosSparse::spmv("N", one, A, Wj2, zero, Wj); // wj = A*MVj = A*Wj2 + } else { + KokkosSparse::spmv("N", one, A, Vj, zero, Wj); // wj = A*Vj } Kokkos::Profiling::pushRegion("GMRES::Orthog:"); - if( opts.ortho == "MGS"){ - for (int i = 0; i <= j; i++){ - auto Vi = Kokkos::subview(V,Kokkos::ALL,i); - H_h(i,j) = KokkosBlas::dot(Vi,Wj); //Vi^* Wj - KokkosBlas::axpy(-H_h(i,j),Vi,Wj);//wj = wj-Hij*Vi + if (opts.ortho == "MGS") { + for (int i = 0; i <= j; i++) { + auto Vi = Kokkos::subview(V, Kokkos::ALL, i); + H_h(i, j) = KokkosBlas::dot(Vi, Wj); // Vi^* Wj + KokkosBlas::axpy(-H_h(i, j), Vi, Wj); // wj = wj-Hij*Vi } - auto Hj_h = Kokkos::subview(H_h,Kokkos::make_pair(0,j+1) ,j); - } - else if( opts.ortho == "CGS2"){ - auto V0j = Kokkos::subview(V,Kokkos::ALL,Kokkos::make_pair(0,j+1)); - auto Hj = Kokkos::subview(H,Kokkos::make_pair(0,j+1) ,j); - auto Hj_h = Kokkos::subview(H_h,Kokkos::make_pair(0,j+1) ,j); - KokkosBlas::gemv("C", one, V0j, Wj, zero, Hj); // Hj = Vj^T * wj - KokkosBlas::gemv("N", -one, V0j, Hj, one, 
Wj); // wj = wj - Vj * Hj - - //Re-orthog CGS: - auto orthoTmpSub = Kokkos::subview(orthoTmp,Kokkos::make_pair(0,j+1)); - KokkosBlas::gemv("C", one, V0j, Wj, zero, orthoTmpSub); // tmp (Hj) = Vj^T * wj - KokkosBlas::gemv("N", -one, V0j, orthoTmpSub, one, Wj); // wj = wj - Vj * tmp - KokkosBlas::axpy(one, orthoTmpSub, Hj); // Hj = Hj + tmp - Kokkos::deep_copy(Hj_h,Hj); - } - else { - throw std::invalid_argument("Invalid argument for 'ortho'. Please use 'CGS2' or 'MGS'."); + auto Hj_h = Kokkos::subview(H_h, Kokkos::make_pair(0, j + 1), j); + } else if (opts.ortho == "CGS2") { + auto V0j = Kokkos::subview(V, Kokkos::ALL, Kokkos::make_pair(0, j + 1)); + auto Hj = Kokkos::subview(H, Kokkos::make_pair(0, j + 1), j); + auto Hj_h = Kokkos::subview(H_h, Kokkos::make_pair(0, j + 1), j); + KokkosBlas::gemv("C", one, V0j, Wj, zero, Hj); // Hj = Vj^T * wj + KokkosBlas::gemv("N", -one, V0j, Hj, one, Wj); // wj = wj - Vj * Hj + + // Re-orthog CGS: + auto orthoTmpSub = + Kokkos::subview(orthoTmp, Kokkos::make_pair(0, j + 1)); + KokkosBlas::gemv("C", one, V0j, Wj, zero, + orthoTmpSub); // tmp (Hj) = Vj^T * wj + KokkosBlas::gemv("N", -one, V0j, orthoTmpSub, one, + Wj); // wj = wj - Vj * tmp + KokkosBlas::axpy(one, orthoTmpSub, Hj); // Hj = Hj + tmp + Kokkos::deep_copy(Hj_h, Hj); + } else { + throw std::invalid_argument( + "Invalid argument for 'ortho'. Please use 'CGS2' or 'MGS'."); } - MT tmpNrm = KokkosBlas::nrm2(Wj); - H_h(j+1,j) = tmpNrm; - if(tmpNrm > 1e-14){ - Vj = Kokkos::subview(V,Kokkos::ALL,j+1); - KokkosBlas::scal(Vj,one/H_h(j+1,j),Wj); // Vj = Wj/H(j+1,j) + MT tmpNrm = KokkosBlas::nrm2(Wj); + H_h(j + 1, j) = tmpNrm; + if (tmpNrm > 1e-14) { + Vj = Kokkos::subview(V, Kokkos::ALL, j + 1); + KokkosBlas::scal(Vj, one / H_h(j + 1, j), Wj); // Vj = Wj/H(j+1,j) } Kokkos::Profiling::popRegion(); - // Givens for real and complex (See Alg 3 in "On computing Givens rotations reliably and efficiently" - // by Demmel, et. al. 2001) - // Apply Givens rotation and compute shortcut residual: - for(int i=0; i= opts.tol){ - throw std::runtime_error("GMRES has experienced lucky breakdown, but the residual has not converged.\n\ + ST f = H_h(j, j); + ST g = H_h(j + 1, j); + MT f2 = AT::real(f) * AT::real(f) + AT::imag(f) * AT::imag(f); + MT g2 = AT::real(g) * AT::real(g) + AT::imag(g) * AT::imag(g); + ST fg2 = f2 + g2; + ST D1 = one / AT::sqrt(f2 * fg2); + CosVal_h(j) = f2 * D1; + fg2 = fg2 * D1; + H_h(j, j) = f * fg2; + SinVal_h(j) = f * D1 * AT::conj(g); + H_h(j + 1, j) = zero; + + GVec_h(j + 1) = GVec_h(j) * (-AT::conj(SinVal_h(j))); + GVec_h(j) = GVec_h(j) * CosVal_h(j); + shortRelRes = fabs(GVec_h(j + 1)) / nrmB; + + std::cout << "Shortcut relative residual for iteration " + << j + (cycle * m) << " is: " << shortRelRes << std::endl; + if (tmpNrm <= 1e-14 && shortRelRes >= opts.tol) { + throw std::runtime_error( + "GMRES has experienced lucky breakdown, but the residual has not converged.\n\ Solver terminated without convergence."); } - if( AT::isNan(ST(shortRelRes)) ){ - throw std::runtime_error("gmres: Relative residual is nan. Terminating solver."); + if (AT::isNan(ST(shortRelRes))) { + throw std::runtime_error( + "gmres: Relative residual is nan. Terminating solver."); } - //If short residual converged, or time to restart, check true residual - if( shortRelRes < opts.tol || j == m-1 ) { - //Compute least squares soln with Givens rotation: - auto GLsSolnSub_h = Kokkos::subview(GLsSoln_h,Kokkos::ALL,0); //Original view has rank 2, need a rank 1 here. 
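+      // NOTE (descriptive comment): because the Givens rotations above keep H
+      // upper triangular and accumulate the rotated residual norm in GVec, the
+      // least-squares problem min ||beta*e1 - H*y|| reduces to the small
+      // (j+1) x (j+1) host-side trsm below; the update is then
+      // x_iter = x + V(:,0:j)*y via gemv, with the right preconditioner M
+      // applied to the correction when M != NULL.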
- auto GVecSub_h = Kokkos::subview(GVec_h, Kokkos::make_pair(0,m)); - Kokkos::deep_copy(GLsSolnSub_h, GVecSub_h); //Copy LS rhs vec for triangle solve. - auto GLsSolnSub2_h = Kokkos::subview(GLsSoln_h,Kokkos::make_pair(0,j+1),Kokkos::ALL); - auto H_Sub_h = Kokkos::subview(H_h, Kokkos::make_pair(0,j+1), Kokkos::make_pair(0,j+1)); - KokkosBlas::trsm("L", "U", "N", "N", one, H_Sub_h, GLsSolnSub2_h); //GLsSoln = H\GLsSoln + // If short residual converged, or time to restart, check true residual + if (shortRelRes < opts.tol || j == m - 1) { + // Compute least squares soln with Givens rotation: + auto GLsSolnSub_h = Kokkos::subview( + GLsSoln_h, Kokkos::ALL, + 0); // Original view has rank 2, need a rank 1 here. + auto GVecSub_h = Kokkos::subview(GVec_h, Kokkos::make_pair(0, m)); + Kokkos::deep_copy(GLsSolnSub_h, + GVecSub_h); // Copy LS rhs vec for triangle solve. + auto GLsSolnSub2_h = Kokkos::subview( + GLsSoln_h, Kokkos::make_pair(0, j + 1), Kokkos::ALL); + auto H_Sub_h = Kokkos::subview(H_h, Kokkos::make_pair(0, j + 1), + Kokkos::make_pair(0, j + 1)); + KokkosBlas::trsm("L", "U", "N", "N", one, H_Sub_h, + GLsSolnSub2_h); // GLsSoln = H\GLsSoln Kokkos::deep_copy(GLsSoln, GLsSoln_h); - //Update solution and compute residual with Givens: - VSub = Kokkos::subview(V,Kokkos::ALL,Kokkos::make_pair(0,j+1)); - Kokkos::deep_copy(Xiter,X); //Can't overwrite X with intermediate solution. - auto GLsSolnSub3 = Kokkos::subview(GLsSoln,Kokkos::make_pair(0,j+1),0); - if(M != NULL){ //Apply right prec to correct soln. - KokkosBlas::gemv ("N", one, VSub, GLsSolnSub3, zero, Wj); //wj = V(1:j+1)*lsSoln - M->apply(Wj, Xiter, "N", one, one); //Xiter = M*wj + X - } - else{ - KokkosBlas::gemv ("N", one, VSub, GLsSolnSub3, one, Xiter); //x_iter = x + V(1:j+1)*lsSoln + // Update solution and compute residual with Givens: + VSub = Kokkos::subview(V, Kokkos::ALL, Kokkos::make_pair(0, j + 1)); + Kokkos::deep_copy(Xiter, + X); // Can't overwrite X with intermediate solution. + auto GLsSolnSub3 = + Kokkos::subview(GLsSoln, Kokkos::make_pair(0, j + 1), 0); + if (M != NULL) { // Apply right prec to correct soln. + KokkosBlas::gemv("N", one, VSub, GLsSolnSub3, zero, + Wj); // wj = V(1:j+1)*lsSoln + M->apply(Wj, Xiter, "N", one, one); // Xiter = M*wj + X + } else { + KokkosBlas::gemv("N", one, VSub, GLsSolnSub3, one, + Xiter); // x_iter = x + V(1:j+1)*lsSoln } - KokkosSparse::spmv("N", one, A, Xiter, zero, Wj); // wj = Ax - Kokkos::deep_copy(Res,B); // Reset r=b. - KokkosBlas::axpy(-one, Wj, Res); // r = b-Ax. + KokkosSparse::spmv("N", one, A, Xiter, zero, Wj); // wj = Ax + Kokkos::deep_copy(Res, B); // Reset r=b. + KokkosBlas::axpy(-one, Wj, Res); // r = b-Ax. trueRes = KokkosBlas::nrm2(Res); - relRes = trueRes/nrmB; - std::cout << "True relative residual for iteration " << j+(cycle*m) << " is : " << relRes << std::endl; - numIters = j+1; + relRes = trueRes / nrmB; + std::cout << "True relative residual for iteration " << j + (cycle * m) + << " is : " << relRes << std::endl; + numIters = j + 1; - if(relRes < opts.tol){ + if (relRes < opts.tol) { converged = true; - Kokkos::deep_copy(X, Xiter); //Final solution is the iteration solution. - break; //End Arnoldi iteration. - } - else if(shortRelRes < 1e-30){ - std::cout << "Short residual has converged to machine zero, but true residual is not converged.\n" - << "You may have given GMRES a singular matrix. Ending the GMRES iteration." + Kokkos::deep_copy( + X, Xiter); // Final solution is the iteration solution. + break; // End Arnoldi iteration. 
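+        // NOTE (descriptive comment): two residual estimates are tracked.
+        // shortRelRes is the "shortcut" value |GVec(j+1)|/||b|| that the Givens
+        // recurrence yields at every iteration; trueRes/relRes is the explicitly
+        // recomputed ||b - A*x||/||b||, evaluated only when the shortcut passes
+        // the tolerance or the cycle ends. Convergence is declared on the true
+        // residual.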
+ } else if (shortRelRes < 1e-30) { + std::cout << "Short residual has converged to machine zero, but true " + "residual is not converged.\n" + << "You may have given GMRES a singular matrix. Ending the " + "GMRES iteration." << std::endl; - break; //End Arnoldi iteration; we can't make any more progress. + break; // End Arnoldi iteration; we can't make any more progress. } } - }//end Arnoldi iter. + } // end Arnoldi iter. cycle++; - //This is the end, or it's time to restart. Update solution to most recent vector. + // This is the end, or it's time to restart. Update solution to most + // recent vector. Kokkos::deep_copy(X, Xiter); } std::cout << "Ending relative residual is: " << relRes << std::endl; myStats.endRelRes = static_cast(relRes); - if( converged ){ + if (converged) { std::cout << "Solver converged! " << std::endl; myStats.convFlagVal = GmresStats::FLAG::Conv; - } - else if( shortRelRes < opts.tol ){ - std::cout << "Shortcut residual converged, but solver experienced a loss of accuracy." << std::endl; + } else if (shortRelRes < opts.tol) { + std::cout << "Shortcut residual converged, but solver experienced a loss " + "of accuracy." + << std::endl; myStats.convFlagVal = GmresStats::FLAG::LOA; - } - else{ + } else { std::cout << "Solver did not converge. :( " << std::endl; myStats.convFlagVal = GmresStats::FLAG::NoConv; } - if(cycle > 0){ - myStats.numIters = (cycle-1)*m + numIters; - } - else{ + if (cycle > 0) { + myStats.numIters = (cycle - 1) * m + numIters; + } else { myStats.numIters = 0; } - std::cout << "The solver completed " << myStats.numIters << " iterations." << std::endl; + std::cout << "The solver completed " << myStats.numIters << " iterations." + << std::endl; Kokkos::Profiling::popRegion(); return myStats; diff --git a/example/gmres/test_cmplx_A.cpp b/example/gmres/test_cmplx_A.cpp index 322273db15..a19d6ad7e1 100644 --- a/example/gmres/test_cmplx_A.cpp +++ b/example/gmres/test_cmplx_A.cpp @@ -42,114 +42,126 @@ //@HEADER */ -#include -#include"KokkosKernels_IOUtils.hpp" -#include -#include -#include -#include -#include +#include +#include "KokkosKernels_IOUtils.hpp" +#include +#include +#include +#include +#include -#include"gmres.hpp" +#include "gmres.hpp" -int main(int /*argc*/, char ** /*argv[]*/) { +int main(int /*argc*/, char** /*argv[]*/) { + typedef Kokkos::complex ST; + typedef int OT; + typedef Kokkos::DefaultExecutionSpace EXSP; - typedef Kokkos::complex ST; - typedef int OT; - typedef Kokkos::DefaultExecutionSpace EXSP; + using ViewVectorType = Kokkos::View; - using ViewVectorType = Kokkos::View; - - std::string filename("young1c.mtx"); // example matrix + std::string filename("young1c.mtx"); // example matrix GmresOpts solverOpts; - solverOpts.m = 100; //Max subspace size before restarting. - solverOpts.tol = 1e-05; //Relative residual convergence tolerance. + solverOpts.m = 100; // Max subspace size before restarting. + solverOpts.tol = 1e-05; // Relative residual convergence tolerance. solverOpts.maxRestart = 60; - solverOpts.ortho = "CGS2"; //orthog type - bool pass1 = false; - bool pass2 = false; + solverOpts.ortho = "CGS2"; // orthog type + bool pass1 = false; + bool pass2 = false; std::cout << "File to process is: " << filename << std::endl; std::cout << "Convergence tolerance is: " << solverOpts.tol << std::endl; - //Initialize Kokkos AFTER parsing parameters: + // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { - - // Read in a matrix Market file and use it to test the Kokkos Operator. 
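+    // NOTE: a minimal sketch of the MatrixMarket load pattern used below, with
+    // the type aliases assumed from this test (ST = Kokkos::complex<double>,
+    // OT = int, EXSP = Kokkos::DefaultExecutionSpace):
+    //
+    //   using crs_t = KokkosSparse::CrsMatrix<ST, OT, EXSP>;
+    //   crs_t A =
+    //       KokkosKernels::Impl::read_kokkos_crst_matrix<crs_t>("young1c.mtx");
+    //   // A now lives in EXSP memory and can be passed straight to gmres().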
- KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::read_kokkos_crst_matrix>(filename.c_str()); - - int n = A.numRows(); - ViewVectorType X("X",n); //Solution and initial guess - ViewVectorType Wj("Wj",n); //For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec - - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B,1.0); - - std::cout << "Testing GMRES with CGS2 ortho:" << std::endl; - GmresStats solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - double nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - double endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=======================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - - if( solveStats.numIters < 700 && solveStats.numIters > 600 && endRes < solverOpts.tol){ - std::cout << "Test CGS2 Passed!" << std::endl; - pass1 = true; + // Read in a matrix Market file and use it to test the Kokkos Operator. + KokkosSparse::CrsMatrix A = + KokkosKernels::Impl::read_kokkos_crst_matrix< + KokkosSparse::CrsMatrix>(filename.c_str()); + + int n = A.numRows(); + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + std::cout << "Testing GMRES with CGS2 ortho:" << std::endl; + GmresStats solveStats = + gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + double nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + double endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=======================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + + if (solveStats.numIters < 700 && solveStats.numIters > 600 && + endRes < solverOpts.tol) { + std::cout << "Test CGS2 Passed!" << std::endl; + pass1 = true; + } else { + std::cout << "Solver did not converge within the expected number of " + "iterations. " + << std::endl + << "CGS2 Test Failed." << std::endl; } - else{ - std::cout << "Solver did not converge within the expected number of iterations. " << std::endl - << "CGS2 Test Failed." 
<< std::endl; - } - std::cout << "=======================================" << std::endl << std::endl << std::endl; - - solverOpts.ortho = "MGS"; - Kokkos::deep_copy(X,0.0); - Kokkos::deep_copy(B,1.0); - - std::cout << "Testing GMRES with MGS ortho:" << std::endl; - solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=======================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - - if( solveStats.numIters < 700 && solveStats.numIters > 600 && endRes < solverOpts.tol){ - std::cout << "Test MGS Passed!" << std::endl; - if( pass1 ){ pass2 = true; }; - } - else{ - std::cout << "Solver did not converge within the expected number of iterations. " << std::endl - << "MGS Test Failed." << std::endl; - } - std::cout << "=======================================" << std::endl << std::endl << std::endl; - + std::cout << "=======================================" << std::endl + << std::endl + << std::endl; + + solverOpts.ortho = "MGS"; + Kokkos::deep_copy(X, 0.0); + Kokkos::deep_copy(B, 1.0); + + std::cout << "Testing GMRES with MGS ortho:" << std::endl; + solveStats = gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=======================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + + if (solveStats.numIters < 700 && solveStats.numIters > 600 && + endRes < solverOpts.tol) { + std::cout << "Test MGS Passed!" << std::endl; + if (pass1) { + pass2 = true; + }; + } else { + std::cout << "Solver did not converge within the expected number of " + "iterations. " + << std::endl + << "MGS Test Failed." << std::endl; + } + std::cout << "=======================================" << std::endl + << std::endl + << std::endl; } Kokkos::finalize(); - if(pass2){ - std::cout << "Both tests have passed!!" << std::endl; - } - else{ + if (pass2) { + std::cout << "Both tests have passed!!" << std::endl; + } else { std::cout << "One or more tests has failed." << std::endl; } - return ( pass2 ? EXIT_SUCCESS : EXIT_FAILURE ); + return (pass2 ? 
EXIT_SUCCESS : EXIT_FAILURE); } - diff --git a/example/gmres/test_real_A.cpp b/example/gmres/test_real_A.cpp index 1a0bb09683..3f6edd06a3 100644 --- a/example/gmres/test_real_A.cpp +++ b/example/gmres/test_real_A.cpp @@ -42,18 +42,18 @@ //@HEADER */ -#include -#include"KokkosKernels_IOUtils.hpp" -#include -#include -#include -#include -#include +#include +#include "KokkosKernels_IOUtils.hpp" +#include +#include +#include +#include +#include #include -#include"gmres.hpp" +#include "gmres.hpp" -int main(int /*argc*/, char ** /*argv[]*/) { +int main(int /*argc*/, char** /*argv[]*/) { typedef double ST; typedef int OT; typedef Kokkos::DefaultExecutionSpace EXSP; @@ -70,101 +70,111 @@ int main(int /*argc*/, char ** /*argv[]*/) { GmresOpts solverOpts; solverOpts.ortho = "CGS2"; // orthog type solverOpts.m = 15; // Max subspace size before restarting. - solverOpts.tol = 1e-10; //Relative residual convergence tolerance. + solverOpts.tol = 1e-10; // Relative residual convergence tolerance. solverOpts.maxRestart = 50; - bool pass1 = false; - bool pass2 = false; + bool pass1 = false; + bool pass2 = false; std::cout << "Convergence tolerance is: " << solverOpts.tol << std::endl; // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { - // Create a diagonally dominant sparse matrix to test: - ncST nnz; - cOT n = 5000; - cOT numRows = n; - cOT numCols = n; - cOT diagDominance = 1; - nnz = 10 * numRows; - sp_matrix_type A = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix - (numRows, numCols, nnz, 0, ncOT(0.01 * numRows), diagDominance); - - // Set initial vectors: - ViewVectorType X("X",n); //Solution and initial guess - ViewVectorType Wj("Wj",n); //For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec - - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B,1.0); - - std::cout << "Testing GMRES with CGS2 ortho:" << std::endl; - GmresStats solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - double nrmB = static_cast(KokkosBlas::nrm2(B)); - KokkosSparse::spmv("N", ST(1.0), A, X, ST(0.0), Wj); // wj = Ax - KokkosBlas::axpy(ST(-1.0), Wj, B); // b = b-Ax. - double endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=======================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - - if (solveStats.numIters < 40 && solveStats.numIters > 20 && - endRes < static_cast(solverOpts.tol)) { - std::cout << "Test CGS2 Passed!" << std::endl; - pass1 = true; - } else { - std::cout - << "Solver did not converge within the expected number of iterations. " - << std::endl - << "CGS2 Test Failed." << std::endl; - } - std::cout << "=======================================" << std::endl << std::endl << std::endl; - - solverOpts.ortho = "MGS"; - Kokkos::deep_copy(X,0.0); - Kokkos::deep_copy(B,1.0); - - std::cout << "Testing GMRES with MGS ortho:" << std::endl; - solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - nrmB = static_cast(KokkosBlas::nrm2(B)); - KokkosSparse::spmv("N", ST(1.0), A, X, ST(0.0), Wj); // wj = Ax - KokkosBlas::axpy(ST(-1.0), Wj, B); // b = b-Ax. 
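+    // NOTE: a minimal sketch of the residual-verification pattern repeated in
+    // these tests, using the views declared above (here ST = double):
+    //
+    //   double nrmB = KokkosBlas::nrm2(B);            // ||b||
+    //   KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj);  // Wj = A*x
+    //   KokkosBlas::axpy(-1.0, Wj, B);                // B  = b - A*x (B overwritten)
+    //   double endRes = KokkosBlas::nrm2(B) / nrmB;   // independent relative residual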
- endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=======================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - - if (solveStats.numIters < 40 && solveStats.numIters > 20 && - endRes < static_cast(solverOpts.tol)) { - std::cout << "Test MGS Passed!" << std::endl; - if (pass1) { - pass2 = true; - }; - } else { - std::cout - << "Solver did not converge within the expected number of iterations. " - << std::endl - << "MGS Test Failed." << std::endl; - } - std::cout << "=======================================" << std::endl << std::endl << std::endl; - + // Create a diagonally dominant sparse matrix to test: + ncST nnz; + cOT n = 5000; + cOT numRows = n; + cOT numCols = n; + cOT diagDominance = 1; + nnz = 10 * numRows; + sp_matrix_type A = + KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + sp_matrix_type>(numRows, numCols, nnz, 0, ncOT(0.01 * numRows), + diagDominance); + + // Set initial vectors: + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + std::cout << "Testing GMRES with CGS2 ortho:" << std::endl; + GmresStats solveStats = + gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + double nrmB = static_cast(KokkosBlas::nrm2(B)); + KokkosSparse::spmv("N", ST(1.0), A, X, ST(0.0), Wj); // wj = Ax + KokkosBlas::axpy(ST(-1.0), Wj, B); // b = b-Ax. + double endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=======================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + + if (solveStats.numIters < 40 && solveStats.numIters > 20 && + endRes < static_cast(solverOpts.tol)) { + std::cout << "Test CGS2 Passed!" << std::endl; + pass1 = true; + } else { + std::cout << "Solver did not converge within the expected number of " + "iterations. " + << std::endl + << "CGS2 Test Failed." << std::endl; + } + std::cout << "=======================================" << std::endl + << std::endl + << std::endl; + + solverOpts.ortho = "MGS"; + Kokkos::deep_copy(X, 0.0); + Kokkos::deep_copy(B, 1.0); + + std::cout << "Testing GMRES with MGS ortho:" << std::endl; + solveStats = gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + nrmB = static_cast(KokkosBlas::nrm2(B)); + KokkosSparse::spmv("N", ST(1.0), A, X, ST(0.0), Wj); // wj = Ax + KokkosBlas::axpy(ST(-1.0), Wj, B); // b = b-Ax. 
+ endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=======================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + + if (solveStats.numIters < 40 && solveStats.numIters > 20 && + endRes < static_cast(solverOpts.tol)) { + std::cout << "Test MGS Passed!" << std::endl; + if (pass1) { + pass2 = true; + }; + } else { + std::cout << "Solver did not converge within the expected number of " + "iterations. " + << std::endl + << "MGS Test Failed." << std::endl; + } + std::cout << "=======================================" << std::endl + << std::endl + << std::endl; } Kokkos::finalize(); - if(pass2){ - std::cout << "Both tests have passed!!" << std::endl; - } - else{ + if (pass2) { + std::cout << "Both tests have passed!!" << std::endl; + } else { std::cout << "One or more tests has failed." << std::endl; } - return ( pass2 ? EXIT_SUCCESS : EXIT_FAILURE ); + return (pass2 ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp index fa788b4daa..99b398e40c 100644 --- a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp +++ b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp @@ -57,10 +57,8 @@ #include #include - using namespace KokkosGraph; - #ifdef KOKKOSKERNELS_INST_DOUBLE using kk_scalar_type = double; #else @@ -73,7 +71,7 @@ using kk_scalar_type = float; using kk_size_type = int; #else #ifdef KOKKOSKERNELS_INST_OFFSET_SIZE_T -using kk_size_type = size_t; +using kk_size_type = size_t; #endif #endif @@ -81,486 +79,483 @@ using kk_size_type = size_t; using kk_lno_type = int; #else #ifdef KOKKOSKERNELS_INST_ORDINAL_INT64_T -using kk_lno_type = int64_t; +using kk_lno_type = int64_t; #endif #endif - - using namespace KokkosGraph; namespace KokkosKernels { namespace Example { - - -struct Parameters -{ - int algorithm; - int repeat; - int chunk_size; - int output_graphviz_vert_max; - int output_graphviz; - int shmemsize; - int verbose_level; - int check_output; - char* coloring_input_file; - char* coloring_output_file; - int output_histogram; - int use_threads; - int use_openmp; - int use_cuda; - int use_serial; - int validate; - char* mtx_bin_file; - - Parameters() - { - algorithm = 0; - repeat = 6; - chunk_size = -1; - shmemsize = 16128; - verbose_level = 0; - check_output = 0; - coloring_input_file = NULL; - coloring_output_file = NULL; - output_histogram = 0; - output_graphviz = 0; - output_graphviz_vert_max = 1500; - use_threads = 0; - use_openmp = 0; - use_cuda = 0; - use_serial = 0; - validate = 0; - mtx_bin_file = NULL; - } +struct Parameters { + int algorithm; + int repeat; + int chunk_size; + int output_graphviz_vert_max; + int output_graphviz; + int shmemsize; + int verbose_level; + int check_output; + char* coloring_input_file; + char* coloring_output_file; + int output_histogram; + int use_threads; + int use_openmp; + int use_cuda; + int use_serial; + int validate; + char* mtx_bin_file; + + Parameters() { + algorithm = 0; + repeat = 6; + chunk_size = -1; + shmemsize = 16128; + verbose_level = 0; + check_output = 0; + coloring_input_file = NULL; + coloring_output_file = NULL; + output_histogram = 0; + output_graphviz 
= 0; + output_graphviz_vert_max = 1500; + use_threads = 0; + use_openmp = 0; + use_cuda = 0; + use_serial = 0; + validate = 0; + mtx_bin_file = NULL; + } }; - - -void -print_options(std::ostream& os, const char* app_name, unsigned int indent = 0) -{ - std::string spaces(indent, ' '); - os << "Usage:" << std::endl - << spaces << " " << app_name << " [parameters]" << std::endl - << std::endl - << spaces << "Parameters:" << std::endl - << spaces << " Parallelism (select one of the following):" << std::endl - << spaces << " --serial Execute serially." << std::endl - << spaces << " --threads Use N posix threads." << std::endl - << spaces << " --openmp Use OpenMP with N threads." << std::endl - << spaces << " --cuda Use CUDA" << std::endl - << std::endl - << spaces << " Required Parameters:" << std::endl - << spaces << " --amtx Input file in Matrix Market format (.mtx)." << std::endl - << std::endl - << spaces << " --algorithm Set the algorithm to use. Allowable values are:" << std::endl - << spaces << " COLORING_D2_MATRIX_SQUARED - Matrix-squared + Distance-1 method." << std::endl - << spaces << " COLORING_D2_SERIAL - Serial algorithm (must use with 'serial' mode)" << std::endl - << spaces << " COLORING_D2_VB - Vertex Based method using boolean forbidden array (Default)." << std::endl - << spaces << " COLORING_D2_VB_BIT - VB with Bitvector Forbidden Array" << std::endl - << spaces << " COLORING_D2_VB_BIT_EF - VB_BIT with Edge Filtering" << std::endl - << std::endl - << spaces << " Optional Parameters:" << std::endl - << spaces << " --output-histogram Print out a histogram of the colors." << std::endl - << spaces << " --output-graphviz Write the output to a graphviz file (G.dot)." << std::endl - << spaces << " Note: Vertices with color 0 will be filled in and colored" << std::endl - << spaces << " --output-graphviz-vert-max Upper limit of vertices in G to allow graphviz output. Default=1500." << std::endl - << spaces << " Requires --output-graphviz to also be enabled." << std::endl - << spaces << " --validate Check that the coloring is a valid distance-2 graph coloring" << std::endl - << spaces << " --verbose-level Set verbosity level [0..5] where N > 0 means print verbose messags." << std::endl - << spaces << " Default: 0" << std::endl - << spaces << " --help Print out command line help." << std::endl - << spaces << " " << std::endl; +void print_options(std::ostream& os, const char* app_name, + unsigned int indent = 0) { + std::string spaces(indent, ' '); + os << "Usage:" << std::endl + << spaces << " " << app_name << " [parameters]" << std::endl + << std::endl + << spaces << "Parameters:" << std::endl + << spaces << " Parallelism (select one of the following):" << std::endl + << spaces << " --serial Execute serially." << std::endl + << spaces << " --threads Use N posix threads." << std::endl + << spaces << " --openmp Use OpenMP with N threads." + << std::endl + << spaces << " --cuda Use CUDA" << std::endl + << std::endl + << spaces << " Required Parameters:" << std::endl + << spaces + << " --amtx Input file in Matrix Market format (.mtx)." + << std::endl + << std::endl + << spaces + << " --algorithm Set the algorithm to use. " + "Allowable values are:" + << std::endl + << spaces + << " COLORING_D2_MATRIX_SQUARED - Matrix-squared + " + "Distance-1 method." + << std::endl + << spaces + << " COLORING_D2_SERIAL - Serial algorithm (must " + "use with 'serial' mode)" + << std::endl + << spaces + << " COLORING_D2_VB - Vertex Based method " + "using boolean forbidden array (Default)." 
+ << std::endl + << spaces + << " COLORING_D2_VB_BIT - VB with Bitvector " + "Forbidden Array" + << std::endl + << spaces + << " COLORING_D2_VB_BIT_EF - VB_BIT with Edge " + "Filtering" + << std::endl + << std::endl + << spaces << " Optional Parameters:" << std::endl + << spaces + << " --output-histogram Print out a histogram of the " + "colors." + << std::endl + << spaces + << " --output-graphviz Write the output to a graphviz " + "file (G.dot)." + << std::endl + << spaces + << " Note: Vertices with color 0 " + "will be filled in and colored" + << std::endl + << spaces + << " --output-graphviz-vert-max Upper limit of vertices in G to " + "allow graphviz output. Default=1500." + << std::endl + << spaces + << " Requires --output-graphviz to " + "also be enabled." + << std::endl + << spaces + << " --validate Check that the coloring is a " + "valid distance-2 graph coloring" + << std::endl + << spaces + << " --verbose-level Set verbosity level [0..5] " + "where N > 0 means print verbose messags." + << std::endl + << spaces << " Default: 0" + << std::endl + << spaces + << " --help Print out command line help." + << std::endl + << spaces << " " << std::endl; } - -int -parse_inputs(KokkosKernels::Example::Parameters& params, int argc, char** argv) -{ - bool got_required_param_amtx = false; - bool got_required_param_algorithm = false; - - for(int i = 1; i < argc; ++i) - { - if(0 == strcasecmp(argv[ i ], "--threads")) - { - params.use_threads = atoi(argv[ ++i ]); - //std::cout << "use_threads = " << params.use_threads << std::endl; - } - else if(0 == strcasecmp(argv[ i ], "--serial")) - { - params.use_serial = atoi(argv[ ++i ]); - //std::cout << "use_serial = " << params.use_serial << std::endl; - } - else if(0 == strcasecmp(argv[ i ], "--openmp")) - { - params.use_openmp = atoi(argv[ ++i ]); - //std::cout << "use_openmp = " << params.use_openmp << std::endl; - } - else if(0 == strcasecmp(argv[ i ], "--cuda")) - { - params.use_cuda = 1; - //std::cout << "use_cuda = " << params.use_cuda << std::endl; - } - else if(0 == strcasecmp(argv[ i ], "--amtx")) - { - got_required_param_amtx = true; - params.mtx_bin_file = argv[ ++i ]; - } - else if(0 == strcasecmp(argv[ i ], "--validate")) - { - params.validate = 1; - } - else if(0 == strcasecmp(argv[ i ], "--verbose-level")) - { - params.verbose_level = atoi( argv[++i] ); - params.verbose_level = std::min(5, params.verbose_level); - params.verbose_level = std::max(0, params.verbose_level); - } - else if(0 == strcasecmp(argv[ i ], "--output-histogram")) - { - params.output_histogram = 1; - } - else if(0 == strcasecmp(argv[ i ], "--output-graphviz")) - { - params.output_graphviz = 1; - } - else if(0 == strcasecmp(argv[ i ], "--output-graphviz-vert-max")) - { - params.output_graphviz_vert_max = atoi( argv[++i] ); - } - else if(0 == strcasecmp(argv[ i ], "--algorithm")) - { - ++i; - if(0 == strcasecmp(argv[ i ], "COLORING_D2_MATRIX_SQUARED")) - { - params.algorithm = 1; - got_required_param_algorithm = true; - } - else if(0 == strcasecmp(argv[ i ], "COLORING_D2_SERIAL")) - { - params.algorithm = 2; - got_required_param_algorithm = true; - } - else if(0 == strcasecmp(argv[ i ], "COLORING_D2_VB") || 0 == strcasecmp(argv[ i ], "COLORING_D2")) - { - params.algorithm = 3; - got_required_param_algorithm = true; - } - else if(0 == strcasecmp(argv[ i ], "COLORING_D2_VB_BIT")) - { - params.algorithm = 4; - got_required_param_algorithm = true; - } - else if(0 == strcasecmp(argv[ i ], "COLORING_D2_VB_BIT_EF")) - { - params.algorithm = 5; - got_required_param_algorithm = 
true; - } - else - { - std::cerr << "2-Unrecognized command line argument #" << i << ": " << argv[ i ] << std::endl; - print_options(std::cout, argv[ 0 ]); - return 1; - } - } - else if(0 == strcasecmp(argv[ i ], "--help") || 0 == strcasecmp(argv[ i ], "-h")) - { - print_options(std::cout, argv[ 0 ]); - return 1; - } - else - { - std::cerr << "3-Unrecognized command line argument #" << i << ": " << argv[ i ] << std::endl; - print_options(std::cout, argv[ 0 ]); - return 1; - } - } - - if(!got_required_param_amtx) - { - std::cout << "Missing required parameter amtx" << std::endl << std::endl; - print_options(std::cout, argv[ 0 ]); - return 1; - } - if(!got_required_param_algorithm) - { - std::cout << "Missing required parameter algorithm" << std::endl << std::endl; - print_options(std::cout, argv[ 0 ]); - return 1; - } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) - { - print_options(std::cout, argv[ 0 ]); +int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, + char** argv) { + bool got_required_param_amtx = false; + bool got_required_param_algorithm = false; + + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "--threads")) { + params.use_threads = atoi(argv[++i]); + // std::cout << "use_threads = " << params.use_threads << std::endl; + } else if (0 == Test::string_compare_no_case(argv[i], "--serial")) { + params.use_serial = atoi(argv[++i]); + // std::cout << "use_serial = " << params.use_serial << std::endl; + } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { + params.use_openmp = atoi(argv[++i]); + // std::cout << "use_openmp = " << params.use_openmp << std::endl; + } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { + params.use_cuda = 1; + // std::cout << "use_cuda = " << params.use_cuda << std::endl; + } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { + got_required_param_amtx = true; + params.mtx_bin_file = argv[++i]; + } else if (0 == Test::string_compare_no_case(argv[i], "--validate")) { + params.validate = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--verbose-level")) { + params.verbose_level = atoi(argv[++i]); + params.verbose_level = std::min(5, params.verbose_level); + params.verbose_level = std::max(0, params.verbose_level); + } else if (0 == + Test::string_compare_no_case(argv[i], "--output-histogram")) { + params.output_histogram = 1; + } else if (0 == + Test::string_compare_no_case(argv[i], "--output-graphviz")) { + params.output_graphviz = 1; + } else if (0 == Test::string_compare_no_case( + argv[i], "--output-graphviz-vert-max")) { + params.output_graphviz_vert_max = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--algorithm")) { + ++i; + if (0 == + Test::string_compare_no_case(argv[i], "COLORING_D2_MATRIX_SQUARED")) { + params.algorithm = 1; + got_required_param_algorithm = true; + } else if (0 == + Test::string_compare_no_case(argv[i], "COLORING_D2_SERIAL")) { + params.algorithm = 2; + got_required_param_algorithm = true; + } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_VB") || + 0 == Test::string_compare_no_case(argv[i], "COLORING_D2")) { + params.algorithm = 3; + got_required_param_algorithm = true; + } else if (0 == + Test::string_compare_no_case(argv[i], "COLORING_D2_VB_BIT")) { + params.algorithm = 4; + got_required_param_algorithm = true; + } else if (0 == Test::string_compare_no_case(argv[i], + "COLORING_D2_VB_BIT_EF")) { + params.algorithm = 5; + 
got_required_param_algorithm = true; + } else { + std::cerr << "2-Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(std::cout, argv[0]); return 1; + } + } else if (0 == Test::string_compare_no_case(argv[i], "--help") || + 0 == Test::string_compare_no_case(argv[i], "-h")) { + print_options(std::cout, argv[0]); + return 1; + } else { + std::cerr << "3-Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(std::cout, argv[0]); + return 1; } - return 0; -} - - - -template -void -run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) -{ - using namespace KokkosGraph; - using namespace KokkosGraph::Experimental; - - int algorithm = params.algorithm; - int shmemsize = params.shmemsize; - - using lno_view_type = typename CrsGraph_type::row_map_type::non_const_type; - using lno_nnz_view_type = typename CrsGraph_type::entries_type::non_const_type; - using size_type = typename lno_view_type::non_const_value_type; - using lno_type = typename lno_nnz_view_type::non_const_value_type; - using KernelHandle_type = KokkosKernels::Experimental::KokkosKernelsHandle; - - - // Create a kernel handle - KernelHandle_type kh; - kh.set_shmem_size(shmemsize); - - if(params.verbose_level > 0) - { - kh.set_verbose(true); - } - - // ------------------------------------------ - // Set up the D2 coloring kernel handle - // ------------------------------------------ - std::string label_algorithm; - switch(algorithm) - { - case 1: - kh.create_distance2_graph_coloring_handle(COLORING_D2_MATRIX_SQUARED); - label_algorithm = "COLORING_D2_MATRIX_SQUARED"; - break; - case 2: - kh.create_distance2_graph_coloring_handle(COLORING_D2_SERIAL); - label_algorithm = "COLORING_D2_SERIAL"; - break; - case 3: - kh.create_distance2_graph_coloring_handle(COLORING_D2_VB); - label_algorithm = "COLORING_D2_VB"; - break; - case 4: - kh.create_distance2_graph_coloring_handle(COLORING_D2_VB_BIT); - label_algorithm = "COLORING_D2_VB_BIT"; - break; - case 5: - kh.create_distance2_graph_coloring_handle(COLORING_D2_VB_BIT_EF); - label_algorithm = "COLORING_D2_VB_BIT_EF"; - break; - default: - kh.create_distance2_graph_coloring_handle(COLORING_D2_VB); - label_algorithm = "COLORING_D2_VB"; - break; - } - - std::cout << std::endl << "Run Graph Color D2 (" << label_algorithm << ")" << std::endl; - - // ------------------------------------------ - // Call the distance-2 graph coloring routine - // ------------------------------------------ - graph_compute_distance2_color(&kh, - crsGraph.numRows(), - num_cols, - crsGraph.row_map, - crsGraph.entries, - crsGraph.row_map, - crsGraph.entries); - - // ------------------------------------------ - // Get the results - // ------------------------------------------ - size_t num_colors = kh.get_distance2_graph_coloring_handle()->get_num_colors(); - size_t num_phases = kh.get_distance2_graph_coloring_handle()->get_num_phases(); - - if(params.verbose_level > 0) - { - std::cout << "Total Time: " << kh.get_distance2_graph_coloring_handle()->get_overall_coloring_time() << std::endl - << "Num colors: " << kh.get_distance2_graph_coloring_handle()->get_num_colors() << std::endl - << "Num Phases: " << kh.get_distance2_graph_coloring_handle()->get_num_phases() << std::endl - << "Colors:\n\t"; - KokkosKernels::Impl::print_1Dview(kh.get_distance2_graph_coloring_handle()->get_vertex_colors()); - std::cout << std::endl; - } - - // ------------------------------------------ - // Save coloring to a GraphViz file - // 
------------------------------------------ - if(params.output_graphviz && crsGraph.numRows() <= params.output_graphviz_vert_max) - { - auto colors = kh.get_distance2_graph_coloring_handle()->get_vertex_colors(); - - std::ofstream os("G.dot", std::ofstream::out); - - kh.get_distance2_graph_coloring_handle()->dump_graphviz(os, crsGraph.numRows(), crsGraph.row_map, crsGraph.entries, colors); - } - - // ------------------------------------------ - // Verify correctness - // ------------------------------------------ - std::string str_color_is_valid = "UNKNOWN"; - if(0 != params.validate) - { - str_color_is_valid = "VALID"; - - bool d2_coloring_is_valid = false; - bool d2_coloring_validation_flags[ 4 ] = {false}; - - d2_coloring_is_valid = KokkosGraph::Impl::graph_verify_distance2_color(&kh, - crsGraph.numRows(), - //crsGraph.numCols(), - num_cols, - crsGraph.row_map, - crsGraph.entries, - crsGraph.row_map, - crsGraph.entries, - d2_coloring_validation_flags); - - // Print out messages based on coloring validation check. - if(d2_coloring_is_valid) - { - std::cout << std::endl << "Distance-2 Graph Coloring is VALID" << std::endl << std::endl; - } - else - { - str_color_is_valid = "INVALID"; - std::cout << std::endl - << "Distance-2 Graph Coloring is NOT VALID" << std::endl - << " - Vert(s) left uncolored : " << d2_coloring_validation_flags[ 1 ] << std::endl - << " - Invalid D2 Coloring : " << d2_coloring_validation_flags[ 2 ] << std::endl - << std::endl; - } - if(d2_coloring_validation_flags[ 3 ]) - { - std::cout << "Distance-2 Graph Coloring may have poor quality." << std::endl - << " - Vert(s) have high color value : " << d2_coloring_validation_flags[ 3 ] << std::endl - << std::endl; - } - } - - // ------------------------------------------ - // Print out a histogram of the colors - // ------------------------------------------ - if(0 != params.output_histogram) - { - KokkosGraph::Impl::graph_print_distance2_color_histogram(&kh, - crsGraph.numRows(), - num_cols, - crsGraph.row_map, - crsGraph.entries, - crsGraph.row_map, - crsGraph.entries, - false); - } - - // ------------------------------------------ - // Print out a summary - // ------------------------------------------ - std::string mtx_bin_file = params.mtx_bin_file; - mtx_bin_file = mtx_bin_file.substr(mtx_bin_file.find_last_of("/\\") + 1); - - std::cout << "Summary" << std::endl - << "-------" << std::endl - << " KExecSName : " << Kokkos::DefaultExecutionSpace::name() << std::endl - << " Filename : " << mtx_bin_file << std::endl - << " Num Verts : " << crsGraph.numRows() << std::endl - << " Num Edges : " << crsGraph.entries.extent(0) << std::endl - << " Concurrency : " << Kokkos::DefaultExecutionSpace::concurrency() << std::endl - << " Algorithm : " << label_algorithm << std::endl - << "Coloring Stats" << std::endl - << " Num colors : " << num_colors << std::endl - << " Num Phases : " << num_phases << std::endl - << " Validation : " << str_color_is_valid << std::endl + } + + if (!got_required_param_amtx) { + std::cout << "Missing required parameter amtx" << std::endl << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + if (!got_required_param_algorithm) { + std::cout << "Missing required parameter algorithm" << std::endl << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + if (!params.use_serial && !params.use_threads && !params.use_openmp && + !params.use_cuda) { + print_options(std::cout, argv[0]); + return 1; + } + return 0; +} -} // run_example() - - - -template -void -driver(Parameters params) -{ - 
using myExecSpace = exec_space; - using myFastDevice = Kokkos::Device; - using crstmat_type = typename KokkosSparse::CrsMatrix; - using graph_type = typename crstmat_type::StaticCrsGraphType; - using data_type = typename graph_type::data_type; - - char* mat_file = params.mtx_bin_file; - - crstmat_type crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(mat_file); - graph_type crsgraph = crsmat.graph; - data_type num_cols = crsmat.numCols(); - - KokkosKernels::Example::run_example - (crsgraph, num_cols, params); - -} // driver() - - -} // namespace Example -} // namespace KokkosKernels - - - -int -main(int argc, char* argv[]) -{ - KokkosKernels::Example::Parameters params; - - if(parse_inputs(params, argc, argv)) - { - return 1; - } - - if(params.mtx_bin_file == NULL) - { - std::cerr << "Provide a matrix file" << std::endl; - return 0; +template +void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { + using namespace KokkosGraph; + using namespace KokkosGraph::Experimental; + + int algorithm = params.algorithm; + int shmemsize = params.shmemsize; + + using lno_view_type = typename CrsGraph_type::row_map_type::non_const_type; + using lno_nnz_view_type = + typename CrsGraph_type::entries_type::non_const_type; + using size_type = typename lno_view_type::non_const_value_type; + using lno_type = typename lno_nnz_view_type::non_const_value_type; + using KernelHandle_type = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_type, kk_scalar_type, ExecSpace, TempMemSpace, + PersistentMemSpace>; + + // Create a kernel handle + KernelHandle_type kh; + kh.set_shmem_size(shmemsize); + + if (params.verbose_level > 0) { + kh.set_verbose(true); + } + + // ------------------------------------------ + // Set up the D2 coloring kernel handle + // ------------------------------------------ + std::string label_algorithm; + switch (algorithm) { + case 1: + kh.create_distance2_graph_coloring_handle(COLORING_D2_MATRIX_SQUARED); + label_algorithm = "COLORING_D2_MATRIX_SQUARED"; + break; + case 2: + kh.create_distance2_graph_coloring_handle(COLORING_D2_SERIAL); + label_algorithm = "COLORING_D2_SERIAL"; + break; + case 3: + kh.create_distance2_graph_coloring_handle(COLORING_D2_VB); + label_algorithm = "COLORING_D2_VB"; + break; + case 4: + kh.create_distance2_graph_coloring_handle(COLORING_D2_VB_BIT); + label_algorithm = "COLORING_D2_VB_BIT"; + break; + case 5: + kh.create_distance2_graph_coloring_handle(COLORING_D2_VB_BIT_EF); + label_algorithm = "COLORING_D2_VB_BIT_EF"; + break; + default: + kh.create_distance2_graph_coloring_handle(COLORING_D2_VB); + label_algorithm = "COLORING_D2_VB"; + break; + } + + std::cout << std::endl + << "Run Graph Color D2 (" << label_algorithm << ")" << std::endl; + + // ------------------------------------------ + // Call the distance-2 graph coloring routine + // ------------------------------------------ + graph_compute_distance2_color(&kh, crsGraph.numRows(), num_cols, + crsGraph.row_map, crsGraph.entries, + crsGraph.row_map, crsGraph.entries); + + // ------------------------------------------ + // Get the results + // ------------------------------------------ + size_t num_colors = + kh.get_distance2_graph_coloring_handle()->get_num_colors(); + size_t num_phases = + kh.get_distance2_graph_coloring_handle()->get_num_phases(); + + if (params.verbose_level > 0) { + std::cout + << "Total Time: " + << kh.get_distance2_graph_coloring_handle()->get_overall_coloring_time() + << std::endl + << "Num colors: " + << 
kh.get_distance2_graph_coloring_handle()->get_num_colors() + << std::endl + << "Num Phases: " + << kh.get_distance2_graph_coloring_handle()->get_num_phases() + << std::endl + << "Colors:\n\t"; + KokkosKernels::Impl::print_1Dview( + kh.get_distance2_graph_coloring_handle()->get_vertex_colors()); + std::cout << std::endl; + } + + // ------------------------------------------ + // Save coloring to a GraphViz file + // ------------------------------------------ + if (params.output_graphviz && + crsGraph.numRows() <= params.output_graphviz_vert_max) { + auto colors = kh.get_distance2_graph_coloring_handle()->get_vertex_colors(); + + std::ofstream os("G.dot", std::ofstream::out); + + kh.get_distance2_graph_coloring_handle()->dump_graphviz( + os, crsGraph.numRows(), crsGraph.row_map, crsGraph.entries, colors); + } + + // ------------------------------------------ + // Verify correctness + // ------------------------------------------ + std::string str_color_is_valid = "UNKNOWN"; + if (0 != params.validate) { + str_color_is_valid = "VALID"; + + bool d2_coloring_is_valid = false; + bool d2_coloring_validation_flags[4] = {false}; + + d2_coloring_is_valid = KokkosGraph::Impl::graph_verify_distance2_color( + &kh, crsGraph.numRows(), + // crsGraph.numCols(), + num_cols, crsGraph.row_map, crsGraph.entries, crsGraph.row_map, + crsGraph.entries, d2_coloring_validation_flags); + + // Print out messages based on coloring validation check. + if (d2_coloring_is_valid) { + std::cout << std::endl + << "Distance-2 Graph Coloring is VALID" << std::endl + << std::endl; + } else { + str_color_is_valid = "INVALID"; + std::cout << std::endl + << "Distance-2 Graph Coloring is NOT VALID" << std::endl + << " - Vert(s) left uncolored : " + << d2_coloring_validation_flags[1] << std::endl + << " - Invalid D2 Coloring : " + << d2_coloring_validation_flags[2] << std::endl + << std::endl; } - - const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads - const int device_id = 0; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); - - // Print out information about the configuration of the run if verbose_level >= 5 - if(params.verbose_level >= 5) - { - Kokkos::print_configuration(std::cout); + if (d2_coloring_validation_flags[3]) { + std::cout << "Distance-2 Graph Coloring may have poor quality." 
+ << std::endl + << " - Vert(s) have high color value : " + << d2_coloring_validation_flags[3] << std::endl + << std::endl; } + } + + // ------------------------------------------ + // Print out a histogram of the colors + // ------------------------------------------ + if (0 != params.output_histogram) { + KokkosGraph::Impl::graph_print_distance2_color_histogram( + &kh, crsGraph.numRows(), num_cols, crsGraph.row_map, crsGraph.entries, + crsGraph.row_map, crsGraph.entries, false); + } + + // ------------------------------------------ + // Print out a summary + // ------------------------------------------ + std::string mtx_bin_file = params.mtx_bin_file; + mtx_bin_file = mtx_bin_file.substr(mtx_bin_file.find_last_of("/\\") + 1); + + std::cout << "Summary" << std::endl + << "-------" << std::endl + << " KExecSName : " << Kokkos::DefaultExecutionSpace::name() + << std::endl + << " Filename : " << mtx_bin_file << std::endl + << " Num Verts : " << crsGraph.numRows() << std::endl + << " Num Edges : " << crsGraph.entries.extent(0) + << std::endl + << " Concurrency : " + << Kokkos::DefaultExecutionSpace::concurrency() << std::endl + << " Algorithm : " << label_algorithm << std::endl + << "Coloring Stats" << std::endl + << " Num colors : " << num_colors << std::endl + << " Num Phases : " << num_phases << std::endl + << " Validation : " << str_color_is_valid << std::endl + << std::endl; + +} // run_example() + +template +void driver(Parameters params) { + using myExecSpace = exec_space; + using myFastDevice = Kokkos::Device; + using crstmat_type = + typename KokkosSparse::CrsMatrix; + using graph_type = typename crstmat_type::StaticCrsGraphType; + using data_type = typename graph_type::data_type; + + char* mat_file = params.mtx_bin_file; + + crstmat_type crsmat = + KokkosKernels::Impl::read_kokkos_crst_matrix(mat_file); + graph_type crsgraph = crsmat.graph; + data_type num_cols = crsmat.numCols(); + + KokkosKernels::Example::run_example( + crsgraph, num_cols, params); + +} // driver() + +} // namespace Example +} // namespace KokkosKernels + +int main(int argc, char* argv[]) { + KokkosKernels::Example::Parameters params; + + if (parse_inputs(params, argc, argv)) { + return 1; + } + + if (params.mtx_bin_file == NULL) { + std::cerr << "Provide a matrix file" << std::endl; + return 0; + } + + const int num_threads = + params.use_openmp; // Assumption is that use_openmp variable is provided + // as number of threads + const int device_id = 0; + Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + + // Print out information about the configuration of the run if verbose_level + // >= 5 + if (params.verbose_level >= 5) { + Kokkos::print_configuration(std::cout); + } + +#if defined(KOKKOS_ENABLE_OPENMP) + if (params.use_openmp) { + KokkosKernels::Example::driver(params); + } +#endif - #if defined(KOKKOS_ENABLE_OPENMP) - if(params.use_openmp) - { - KokkosKernels::Example::driver(params); - } - #endif - - #if defined(KOKKOS_ENABLE_CUDA) - if(params.use_cuda) - { - KokkosKernels::Example::driver(params); - } - #endif +#if defined(KOKKOS_ENABLE_CUDA) + if (params.use_cuda) { + KokkosKernels::Example::driver(params); + } +#endif - #if defined(KOKKOS_ENABLE_SERIAL) - if(params.use_serial) - { - KokkosKernels::Example::driver(params); - } - #endif +#if defined(KOKKOS_ENABLE_SERIAL) + if (params.use_serial) { + KokkosKernels::Example::driver(params); + } +#endif - Kokkos::finalize(); + Kokkos::finalize(); - return 0; + return 0; } diff --git 
a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp index 74aa8fb802..9909c55720 100644 --- a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp +++ b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp @@ -65,359 +65,335 @@ #include #include - - // Command Line Parameters structure -typedef struct params -{ - uint32_t use_serial = false; - uint32_t use_threads = false; - uint32_t use_cuda = false; - uint32_t use_openmp = false; - bool verbose = false; - - size_t problem_size = 20; - size_t repeat = 1; +typedef struct params { + uint32_t use_serial = false; + uint32_t use_threads = false; + uint32_t use_cuda = false; + uint32_t use_openmp = false; + bool verbose = false; + + size_t problem_size = 20; + size_t repeat = 1; } parameters_t; - - namespace KokkosKernels { namespace Experiment { +template +struct functorTestHashmapAccumulator { + typedef ExecutionSpace execution_space; + typedef typename Kokkos::View data_view_t; + + const size_t _num_entries; + const data_view_t _data; + uniform_memory_pool_t _memory_pool; + const size_t _hash_size; + const size_t _max_hash_entries; + const parameters_t& _params; + + typedef Kokkos::Experimental::UniqueToken< + execution_space, Kokkos::Experimental::UniqueTokenScope::Global> + unique_token_t; + unique_token_t tokens; + + functorTestHashmapAccumulator(const size_t num_entries, + const data_view_t& data, + uniform_memory_pool_t memory_pool, + const size_t hash_size, + const size_t max_hash_entries, + const parameters_t& params) + : _num_entries(num_entries), + _data(data), + _memory_pool(memory_pool), + _hash_size(hash_size), + _max_hash_entries(max_hash_entries), + _params(params), + tokens(ExecutionSpace()) { + if (_params.verbose) { + std::cout << "UniqueToken.size: " << tokens.size() << std::endl; + } + } - template - struct functorTestHashmapAccumulator - { - typedef ExecutionSpace execution_space; - typedef typename Kokkos::View data_view_t; - - const size_t _num_entries; - const data_view_t _data; - uniform_memory_pool_t _memory_pool; - const size_t _hash_size; - const size_t _max_hash_entries; - const parameters_t& _params; - - typedef Kokkos::Experimental::UniqueToken unique_token_t; - unique_token_t tokens; - - functorTestHashmapAccumulator( const size_t num_entries, - const data_view_t& data, - uniform_memory_pool_t memory_pool, - const size_t hash_size, - const size_t max_hash_entries, - const parameters_t& params) - : _num_entries(num_entries) - , _data(data) - , _memory_pool(memory_pool) - , _hash_size(hash_size) - , _max_hash_entries(max_hash_entries) - , _params(params) - , tokens( ExecutionSpace() ) - { - if(_params.verbose) - { - std::cout << "UniqueToken.size: " << tokens.size() << std::endl; - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const scalar_t idx) const - { - typedef scalar_t hash_size_type; - typedef scalar_t hash_key_type; - typedef scalar_t hash_value_type; - - // Alternative to team_policy thread id - auto tid = tokens.acquire(); - - // Acquire a chunk from the memory pool using a spin-loop. 
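Both the removed and the reformatted functor bodies in this hunk follow the same acquire, use, release discipline against the UniformMemoryPool. The fragment below restates that discipline with the hunk's own names; it is a sketch only, and the "returns nullptr until a chunk is free" behaviour of allocate_chunk is inferred from the spin-loop here rather than from separate documentation.

// Sketch (not part of the patch); _memory_pool, tokens and scalar_t are the
// members of the functor in this hunk.
auto tid = tokens.acquire();                       // stable per-thread id
volatile scalar_t* ptr_temp = nullptr;
while (ptr_temp == nullptr) {                      // spin until some chunk is released
  ptr_temp = (volatile scalar_t*)_memory_pool.allocate_chunk(tid);
}
scalar_t* ptr_memory_pool_chunk = (scalar_t*)ptr_temp;  // keep the base pointer
// ptr_temp is then advanced to carve out the hash arrays, so it is the saved
// base pointer that must be handed back when the thread is done:
_memory_pool.release_chunk(ptr_memory_pool_chunk);
tokens.release(tid);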
- volatile scalar_t* ptr_temp = nullptr; - while(nullptr==ptr_temp) - { - ptr_temp = (volatile scalar_t*)(_memory_pool.allocate_chunk(tid)); - } - scalar_t* ptr_memory_pool_chunk = (scalar_t*)(ptr_temp); - - KokkosKernels::Experimental::HashmapAccumulator hash_map; - - // Set pointer to hash indices - scalar_t* used_hash_indices = (scalar_t*)(ptr_temp); - ptr_temp += _hash_size; - - // Set pointer to hash begins - hash_map.hash_begins = (scalar_t*)(ptr_temp); - ptr_temp += _hash_size; - - // Set pointer to hash nexts - hash_map.hash_nexts = (scalar_t*)(ptr_temp); - ptr_temp += _max_hash_entries; - - // Set pointer to hash keys - hash_map.keys = (scalar_t*)(ptr_temp); - // ptr_temp += _max_hash_entries; - - // Set pointer to hash values - //hash_map.values = (scalar_t*)(ptr_temp); - - // Set up limits in Hashmap_Accumulator - hash_map.hash_key_size = _max_hash_entries; - hash_map.max_value_size = _max_hash_entries; - - // hash function is hash_size-1 (note: hash_size must be a power of 2) - scalar_t hash_func_pow2 = _hash_size-1; - - // These are updated by Hashmap_Accumulator insert functions. - scalar_t used_hash_size = 0; - scalar_t used_hash_count = 0; - - // Loop over stuff - for(size_t i=0; i<_num_entries; i++) - { - scalar_t key = _data(i); - - // Compute the hash index using & instead of % (modulus is slower). - scalar_t hash = key & hash_func_pow2; - - int r = hash_map.sequential_insert_into_hash_TrackHashes(hash, - key, - &used_hash_size, - hash_map.max_value_size, - &used_hash_count, - used_hash_indices); - - // Check return code - if(r) - { - // insert should return nonzero if the insert failed, but for sequential_insert_into_hash_TrackHashes - // the 'full' case is currently ignored, so r will always be 0. - } - } - - // TODO: Get the # of unique values inserted and return that out of the functor. - - // Reset the Begins values to -1 before releasing the memory pool chunk. - // If you don't do this the next thread that grabs this memory chunk will not work properly. 
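The pointer arithmetic earlier in this operator() carved a single pool chunk into the four arrays the HashmapAccumulator needs, and the reset loop below only has to visit the buckets recorded in used_hash_indices because of that layout. To make the offsets explicit, a compact restatement using the same member names and sizes as this hunk (a layout sketch, not additional patch content):

// One chunk of 2*_hash_size + 2*_max_hash_entries scalar_t's, used as:
scalar_t* p                 = ptr_memory_pool_chunk;
scalar_t* used_hash_indices = p;  p += _hash_size;         // buckets dirtied by this thread
hash_map.hash_begins        = p;  p += _hash_size;         // head entry per bucket, -1 means empty
hash_map.hash_nexts         = p;  p += _max_hash_entries;  // collision-chain links
hash_map.keys               = p;                           // inserted keys
// hash_map.values would follow here if the accumulator also stored values.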
- for(scalar_t i=0; i - void experiment(const parameters_t& params) - { - typedef typename KokkosKernels::Impl::UniformMemoryPool uniform_memory_pool_t; - typedef typename Kokkos::View data_view_t; - typedef typename data_view_t::HostMirror data_view_hostmirror_t; - - size_t num_entries = params.problem_size; - - // Set max value in the list - size_t max_value = 100; - - // Get the concurrecny - size_t concurrency = execution_space::concurrency(); - - // Set up random number generator - std::random_device rd; - std::mt19937 eng(rd()); - std::uniform_int_distribution distr(1, max_value); - - // Create a view of random values - data_view_t d_data("data", num_entries); - data_view_hostmirror_t h_data = Kokkos::create_mirror_view(d_data); - - for(size_t i=0; i m_space(mem_chunk_count, mem_chunk_size, -1, pool_type); - uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, -1, pool_type); - - functorTestHashmapAccumulator - testHashmapAccumulator(num_entries, d_data, memory_pool, hash_size, max_hash_entries, params); + KOKKOS_INLINE_FUNCTION + void operator()(const scalar_t idx) const { + typedef scalar_t hash_size_type; + typedef scalar_t hash_key_type; + typedef scalar_t hash_value_type; - Kokkos::parallel_for("testHashmapAccumulator", num_entries, testHashmapAccumulator); + // Alternative to team_policy thread id + auto tid = tokens.acquire(); - if(params.verbose) - { - double t = timer.seconds(); - std::cout << "Execution Time: " << std::setw(-2) << t << std::endl; - timer.reset(); - } + // Acquire a chunk from the memory pool using a spin-loop. + volatile scalar_t* ptr_temp = nullptr; + while (nullptr == ptr_temp) { + ptr_temp = (volatile scalar_t*)(_memory_pool.allocate_chunk(tid)); } + scalar_t* ptr_memory_pool_chunk = (scalar_t*)(ptr_temp); -} // namespace Experiment -} // namespace KokkosKernels - - - -void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) -{ - std::string spaces(indent, ' '); - os << "Usage:" << std::endl - << spaces << " " << app_name << " [parameters]" << std::endl - << std::endl - << spaces << "Parameters:" << std::endl - << spaces << " Parallelism (select one of the following):" << std::endl - << spaces << " --serial Execute serially." << std::endl - << spaces << " --threads Use N posix threads." << std::endl - << spaces << " --openmp Use OpenMP with N threads." << std::endl - << spaces << " --cuda Use CUDA" << std::endl - << spaces << " Optional Parameters:" << std::endl - << spaces << " --problem-size Problem Size (Default: 20)" << std::endl - << spaces << " --verbose Verbose output" << std::endl - << spaces << " --help Print out command line help." 
<< std::endl - << spaces << " " << std::endl; -} // print_options - - - -//int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, char **argv) -int parse_inputs(parameters_t ¶ms, int argc, char **argv) -{ - if(argc==1) - { - print_options(std::cout, argv[0]); - return 1; - } + KokkosKernels::Experimental::HashmapAccumulator< + hash_size_type, hash_key_type, hash_value_type> + hash_map; + + // Set pointer to hash indices + scalar_t* used_hash_indices = (scalar_t*)(ptr_temp); + ptr_temp += _hash_size; + + // Set pointer to hash begins + hash_map.hash_begins = (scalar_t*)(ptr_temp); + ptr_temp += _hash_size; + + // Set pointer to hash nexts + hash_map.hash_nexts = (scalar_t*)(ptr_temp); + ptr_temp += _max_hash_entries; + + // Set pointer to hash keys + hash_map.keys = (scalar_t*)(ptr_temp); + // ptr_temp += _max_hash_entries; + + // Set pointer to hash values + // hash_map.values = (scalar_t*)(ptr_temp); + + // Set up limits in Hashmap_Accumulator + hash_map.hash_key_size = _max_hash_entries; + hash_map.max_value_size = _max_hash_entries; + + // hash function is hash_size-1 (note: hash_size must be a power of 2) + scalar_t hash_func_pow2 = _hash_size - 1; + + // These are updated by Hashmap_Accumulator insert functions. + scalar_t used_hash_size = 0; + scalar_t used_hash_count = 0; - for(int i = 1; i < argc; ++i) - { - if(0 == strcasecmp(argv[i], "--threads")) - { - params.use_threads = atoi(argv[++i]); - } - else if(0 == strcasecmp(argv[i], "--serial")) - { - params.use_serial = atoi(argv[++i]); - } - else if(0 == strcasecmp(argv[i], "--openmp")) - { - params.use_openmp = atoi(argv[++i]); - } - else if(0 == strcasecmp(argv[i], "--cuda")) - { - params.use_cuda = 1; - } - else if (0 == strcasecmp(argv[i], "--repeat")) - { - params.repeat = atoi(argv[++i]); - } - else if (0 == strcasecmp(argv[i], "--problem-size")) - { - params.problem_size = atoi(argv[++i]); - } - else if(0 == strcasecmp(argv[i], "--verbose") || 0 == strcasecmp(argv[i], "-V") ) - { - params.verbose = true; - } - else if(0 == strcasecmp(argv[i], "help") || 0 == strcasecmp(argv[i], "-h")) - { - print_options(std::cout, argv[0]); - return 1; - } - else - { - std::cerr << "3-Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; - print_options(std::cout, argv[0]); - return 1; - } + // Loop over stuff + for (size_t i = 0; i < _num_entries; i++) { + scalar_t key = _data(i); + + // Compute the hash index using & instead of % (modulus is slower). + scalar_t hash = key & hash_func_pow2; + + int r = hash_map.sequential_insert_into_hash_TrackHashes( + hash, key, &used_hash_size, hash_map.max_value_size, &used_hash_count, + used_hash_indices); + + // Check return code + if (r) { + // insert should return nonzero if the insert failed, but for + // sequential_insert_into_hash_TrackHashes the 'full' case is currently + // ignored, so r will always be 0. + } } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) - { - print_options(std::cout, argv[0]); - return 1; + + // TODO: Get the # of unique values inserted and return that out of the + // functor. + + // Reset the Begins values to -1 before releasing the memory pool chunk. + // If you don't do this the next thread that grabs this memory chunk will + // not work properly. 
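The insert loop above computes each bucket as key & hash_func_pow2 instead of key % hash_size, which is only valid because experiment() forces hash_size to a power of two. A small self-contained check of that equivalence (a standalone snippet, separate from the example itself):

#include <cassert>
#include <cstdint>

int main() {
  const std::uint64_t hash_size = 128;  // any power of two works
  const std::uint64_t mask      = hash_size - 1;
  for (std::uint64_t key = 0; key < 10000; ++key) {
    // Masking with (size - 1) and taking the modulus agree for power-of-two sizes.
    assert((key & mask) == (key % hash_size));
  }
  return 0;
}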
+ for (scalar_t i = 0; i < used_hash_count; i++) { + scalar_t dirty_hash = used_hash_indices[i]; + hash_map.hash_begins[dirty_hash] = -1; } - return 0; -} // parse_inputs + // Release the memory pool chunk back to the pool + _memory_pool.release_chunk(ptr_memory_pool_chunk); + // Release the UniqueToken + tokens.release(tid); -int main(int argc, char *argv[]) -{ - //KokkosKernels::Experiment::Parameters params; - parameters_t params; + } // operator() - // Override default repeats (default is 6) - params.repeat = 1; +}; // functorTestHashmapAccumulator - if(parse_inputs(params, argc, argv)) - { - return 1; - } +template +void experiment(const parameters_t& params) { + typedef + typename KokkosKernels::Impl::UniformMemoryPool + uniform_memory_pool_t; + typedef typename Kokkos::View data_view_t; + typedef typename data_view_t::HostMirror data_view_hostmirror_t; - const int device_id = 0; - const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads + size_t num_entries = params.problem_size; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + // Set max value in the list + size_t max_value = 100; - if(params.verbose) - { - Kokkos::print_configuration(std::cout); - } + // Get the concurrecny + size_t concurrency = execution_space::concurrency(); - // Work goes here. - KokkosKernels::Experiment::experiment(params); + // Set up random number generator + std::random_device rd; + std::mt19937 eng(rd()); + std::uniform_int_distribution distr(1, max_value); - Kokkos::finalize(); - std::cout << "Done." << std::endl; - return 0; + // Create a view of random values + data_view_t d_data("data", num_entries); + data_view_hostmirror_t h_data = Kokkos::create_mirror_view(d_data); + + for (size_t i = 0; i < num_entries; i++) { + h_data(i) = distr(eng); + } + + // Print out the array of random numbers if the list size is small. + if (num_entries <= 50 || params.verbose) { + std::cout << "Data: "; + for (size_t i = 0; i < num_entries; i++) { + std::cout << h_data(i) << " "; + } + std::cout << std::endl; + } + + Kokkos::Timer timer; + + // Deep copy initialized values to device memory. + Kokkos::deep_copy(d_data, h_data); + + // Set Hash Table Parameters + size_t max_hash_entries = max_value; // Max number of entries that can be + // inserted (values allowed are 1..100) + size_t hash_size_hint = + max_value; // How many hash keys are allowed. The actual hash size will + // be set to the next power of 2 bigger than hash_size_hint. + + // Set the hash_size as the next power of 2 bigger than hash_size_hint. + // - hash_size must be a power of two since we use & rather than % (which is + // slower) for computing the hash value for HashmapAccumulator. + size_t hash_size = 1; + while (hash_size < hash_size_hint) { + hash_size *= 2; + } + + // Create Uniform Initialized Memory Pool + KokkosKernels::Impl::PoolType pool_type = + KokkosKernels::Impl::OneThread2OneChunk; + + // Determine memory chunk size for UniformMemoryPool + size_t mem_chunk_size = hash_size; // for hash indices + mem_chunk_size += hash_size; // for hash begins + mem_chunk_size += max_hash_entries; // for hash nexts + mem_chunk_size += max_hash_entries; // for hash keys + // mem_chunk_size += max_entries; // for hash values + + // Set a cap on # of chunks to 32. In application something else should be + // done here differently if we're OpenMP vs. GPU but for this example we can + // just cap our number of chunks at 32. 
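Taken together, experiment() rounds the hash size up to a power of two, sizes each chunk to back the four arrays laid out in the functor, and caps the chunk count, as the KOKKOSKERNELS_MACRO_MIN line that follows does. A condensed standalone sketch of that arithmetic, with a hypothetical concurrency value standing in for execution_space::concurrency():

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t max_value        = 100;        // values are drawn from 1..100
  const std::size_t max_hash_entries = max_value;  // worst case: every value distinct
  std::size_t hash_size = 1;
  while (hash_size < max_value) hash_size *= 2;    // 128, so the & mask above is legal

  const std::size_t mem_chunk_size = 2 * hash_size          // used_hash_indices + hash_begins
                                   + 2 * max_hash_entries;  // hash_nexts + keys
  const std::size_t concurrency     = 64;                   // hypothetical; queried at run time
  const std::size_t mem_chunk_count = std::min<std::size_t>(32, concurrency);
  std::printf("hash_size=%zu chunk_size=%zu chunks=%zu\n", hash_size, mem_chunk_size,
              mem_chunk_count);
  return 0;
}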
+ size_t mem_chunk_count = KOKKOSKERNELS_MACRO_MIN(32, concurrency); + + // KokkosKernels::Impl::UniformMemoryPool m_space(mem_chunk_count, mem_chunk_size, -1, pool_type); + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, -1, + pool_type); + + functorTestHashmapAccumulator + testHashmapAccumulator(num_entries, d_data, memory_pool, hash_size, + max_hash_entries, params); + + Kokkos::parallel_for("testHashmapAccumulator", num_entries, + testHashmapAccumulator); + + if (params.verbose) { + double t = timer.seconds(); + std::cout << "Execution Time: " << std::setw(-2) << t << std::endl; + timer.reset(); + } +} + +} // namespace Experiment +} // namespace KokkosKernels + +void print_options(std::ostream& os, const char* app_name, + unsigned int indent = 0) { + std::string spaces(indent, ' '); + os << "Usage:" << std::endl + << spaces << " " << app_name << " [parameters]" << std::endl + << std::endl + << spaces << "Parameters:" << std::endl + << spaces << " Parallelism (select one of the following):" << std::endl + << spaces << " --serial Execute serially." << std::endl + << spaces << " --threads Use N posix threads." << std::endl + << spaces << " --openmp Use OpenMP with N threads." + << std::endl + << spaces << " --cuda Use CUDA" << std::endl + << spaces << " Optional Parameters:" << std::endl + << spaces << " --problem-size Problem Size (Default: 20)" + << std::endl + << spaces << " --verbose Verbose output" << std::endl + << spaces << " --help Print out command line help." + << std::endl + << spaces << " " << std::endl; +} // print_options + +// int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, +// char **argv) +int parse_inputs(parameters_t& params, int argc, char** argv) { + if (argc == 1) { + print_options(std::cout, argv[0]); + return 1; + } + + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "--threads")) { + params.use_threads = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--serial")) { + params.use_serial = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { + params.use_openmp = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { + params.use_cuda = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { + params.repeat = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--problem-size")) { + params.problem_size = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--verbose") || + 0 == Test::string_compare_no_case(argv[i], "-V")) { + params.verbose = true; + } else if (0 == Test::string_compare_no_case(argv[i], "help") || + 0 == Test::string_compare_no_case(argv[i], "-h")) { + print_options(std::cout, argv[0]); + return 1; + } else { + std::cerr << "3-Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + } + if (!params.use_serial && !params.use_threads && !params.use_openmp && + !params.use_cuda) { + print_options(std::cout, argv[0]); + return 1; + } + return 0; +} // parse_inputs + +int main(int argc, char* argv[]) { + // KokkosKernels::Experiment::Parameters params; + parameters_t params; + + // Override default repeats (default is 6) + params.repeat = 1; + + if (parse_inputs(params, argc, argv)) { + return 1; + } + + const int device_id = 0; + const int num_threads = + params.use_openmp; // Assumption is that use_openmp variable is provided + // as number of threads + + 
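Both example mains hand Kokkos an InitArguments object instead of forwarding argc/argv. My reading, which is an assumption about the Kokkos 3.x struct rather than something stated in this patch, is that the three positional arguments are host thread count, NUMA count (-1 lets Kokkos decide) and device id; the field-by-field form below says the same thing more explicitly.

// Assumed field-wise equivalent of Kokkos::InitArguments(num_threads, -1, device_id).
Kokkos::InitArguments args;
args.num_threads = num_threads;  // host threads for the OpenMP/Threads backends
args.num_numa    = -1;           // leave NUMA placement to Kokkos
args.device_id   = device_id;    // device to select when a GPU backend is enabled
Kokkos::initialize(args);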
Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + + if (params.verbose) { + Kokkos::print_configuration(std::cout); + } + + // Work goes here. + KokkosKernels::Experiment::experiment(params); + + Kokkos::finalize(); + std::cout << "Done." << std::endl; + return 0; } diff --git a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp index f4a0026de1..9fa6adb484 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp @@ -1,23 +1,23 @@ #include "KokkosGraph_wiki_9pt_stencil.hpp" #include "KokkosGraph_MIS2.hpp" -int main() -{ +int main() { Kokkos::initialize(); { using GraphDemo::numVertices; RowmapType rowmapDevice; ColindsType colindsDevice; - //Step 1: Generate the graph on host, allocate space on device, and copy. - //See function "generate9pt" below. + // Step 1: Generate the graph on host, allocate space on device, and copy. + // See function "generate9pt" below. GraphDemo::generate9pt(rowmapDevice, colindsDevice); - //Step 2: Run MIS-2 based coarsening and print the result + // Step 2: Run MIS-2 based coarsening and print the result { std::cout << "Coarsened vertex labels:\n"; Ordinal numClusters = 0; - auto labels = KokkosGraph::Experimental::graph_mis2_coarsen( - rowmapDevice, colindsDevice, numClusters, KokkosGraph::MIS2_FAST); - //coarsening labels can be printed in the same way as colors + auto labels = + KokkosGraph::graph_mis2_aggregate( + rowmapDevice, colindsDevice, numClusters); + // coarsening labels can be printed in the same way as colors GraphDemo::printColoring(labels, numClusters); putchar('\n'); } @@ -25,4 +25,3 @@ int main() Kokkos::finalize(); return 0; } - diff --git a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp index 466d506170..2b56af9c96 100644 --- a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp @@ -1,29 +1,32 @@ #include "KokkosGraph_wiki_9pt_stencil.hpp" #include "KokkosGraph_MIS2.hpp" -int main() -{ +int main() { Kokkos::initialize(); { using GraphDemo::numVertices; RowmapType rowmapDevice; ColindsType colindsDevice; - //Step 1: Generate the graph on host, allocate space on device, and copy. - //See function "generate9pt" below. + // Step 1: Generate the graph on host, allocate space on device, and copy. + // See function "generate9pt" below. 
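These wiki graph examples (coarsening above, MIS-2 here) describe the 9-point stencil graph with a rowmap view and a column-index view, i.e. a compressed-sparse-row adjacency structure in which vertex v's neighbours sit in colinds[rowmap[v] .. rowmap[v+1]). A tiny host-side illustration with a hypothetical 3-vertex path graph, unrelated to the stencil the demo actually generates:

#include <cstdio>

int main() {
  // CSR adjacency for the path graph 0 - 1 - 2.
  const int rowmap[4]  = {0, 1, 3, 4};
  const int colinds[4] = {1, 0, 2, 1};
  for (int v = 0; v < 3; ++v) {
    std::printf("vertex %d:", v);
    for (int e = rowmap[v]; e < rowmap[v + 1]; ++e) std::printf(" %d", colinds[e]);
    std::printf("\n");
  }
  return 0;
}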
GraphDemo::generate9pt(rowmapDevice, colindsDevice); - //Step 2: Run distance-2 MIS and print the results, with three different algorithms + // Step 2: Run distance-2 MIS and print the results, with three different + // algorithms { - //Run coloring - auto misDevice = KokkosGraph::Experimental::graph_d2_mis( - rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); + // Run coloring + auto misDevice = + KokkosGraph::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); std::cout << "Distance-2 MIS, FAST algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + << misDevice.extent(0) << " out of " << GraphDemo::numVertices + << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); - misDevice = KokkosGraph::Experimental::graph_d2_mis( + misDevice = KokkosGraph::graph_d2_mis( rowmapDevice, colindsDevice, KokkosGraph::MIS2_QUALITY); std::cout << "Distance-2 MIS, QUALITY algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + << misDevice.extent(0) << " out of " << GraphDemo::numVertices + << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); } @@ -31,4 +34,3 @@ int main() Kokkos::finalize(); return 0; } - diff --git a/install_test/CMakeLists.txt.in b/install_test/CMakeLists.txt.in index 74605ac73f..edf6c2cc1a 100644 --- a/install_test/CMakeLists.txt.in +++ b/install_test/CMakeLists.txt.in @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.16) project(kokkoskernels_install_test CXX) include(CTest) diff --git a/master_history.txt b/master_history.txt index 652150a31f..222913d92c 100644 --- a/master_history.txt +++ b/master_history.txt @@ -15,3 +15,4 @@ tag: 3.3.01 date: 01/18/2021 master: f64b1c57 release: 4e1cc00b tag: 3.4.00 date: 04/26/2021 master: fe439b21 release: d3c33910 tag: 3.4.01 date: 05/20/2021 master: 564dccb3 release: 4c62eb86 tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d +tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4 diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index d9ec2a34d9..91dc727867 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -35,6 +35,7 @@ if (KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) blas/blas2/KokkosBlas2_gemv_tracked_perf_test.cpp blas/blas1/KokkosBlas_dot_tracked_perf_test.cpp blas/blas1/KokkosBlas_team_dot_tracked_perf_test.cpp + blas/blas3/KokkosBlas3_gemm_tracked_perf_test.cpp PerfTestUtilities.cpp sparse/spmv/OpenMPSmartStatic_SPMV.cpp #sparse / KokkosSparse_spgemm_test.cpp diff --git a/perf_test/KokkosKernelsTrackedTesting.cpp b/perf_test/KokkosKernelsTrackedTesting.cpp index ffb7f98447..10fb834270 100644 --- a/perf_test/KokkosKernelsTrackedTesting.cpp +++ b/perf_test/KokkosKernelsTrackedTesting.cpp @@ -9,6 +9,7 @@ // For RPS version of BLAS Level-1 Tests #include "blas/blas1/tracked_testing.hpp" #include "blas/blas2/tracked_testing.hpp" +#include "blas/blas3/tracked_testing.hpp" int main(int argc, char* argv[]) { { // argument parsing for setting input data at runtime @@ -55,6 +56,8 @@ int main(int argc, char* argv[]) { test::blas2::build_blas2_executor(exec, argc, argv, run_params); + test::blas3::build_blas3_executor(exec, argc, argv, run_params); + exec.setupSuite(); // STEP 3: Report suite run summary diff --git a/perf_test/PerfTestUtilities.hpp b/perf_test/PerfTestUtilities.hpp index 828c0d285a..743df53502 100644 --- a/perf_test/PerfTestUtilities.hpp +++ b/perf_test/PerfTestUtilities.hpp @@ -46,8 +46,9 
@@ inline std::vector get_directories(std::string path) { while ((dir = readdir(d)) != NULL) { std::string nname = std::string(dir->d_name); // Check to see if item is a directory - //if (isDirectory(path + '/' + nname)) - if(nname != "." && nname != ".." && isDirectory(path + '/' + dir->d_name)) + // if (isDirectory(path + '/' + nname)) + if (nname != "." && nname != ".." && + isDirectory(path + '/' + dir->d_name)) // std::vector::emplace_back: insert a new element to the end of vector paths.emplace_back(dir->d_name); } diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp index 2930aa4e79..50f15cf719 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp @@ -22,7 +22,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #define __KOKKOSBATCHED_TEST_ENABLE_CUDA__ #include "KokkosBatched_Util.hpp" @@ -30,16 +30,16 @@ #define KOKKOSBATCHED_USE_UNBLOCKED_ALGO 1 //#define KOKKOSBATCHED_USE_BLOCKED_ALGO 1 -#if defined (KOKKOSBATCHED_USE_UNBLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_UNBLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Unblocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Unblocked AlgoGemm; typedef KokkosBatched::Algo::Trsv::Unblocked AlgoTrsv; typedef KokkosBatched::Algo::Gemv::Unblocked AlgoGemv; #endif -#if defined (KOKKOSBATCHED_USE_BLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Blocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_BLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Blocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Blocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Blocked AlgoGemm; @@ -51,8 +51,8 @@ typedef KokkosBatched::Algo::Gemv::Blocked AlgoGemv; using namespace KokkosBatched; -int main (int argc, char *argv[]) { - Kokkos::initialize(argc, argv); +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); typedef Kokkos::DefaultExecutionSpace DeviceSpaceType; @@ -60,40 +60,53 @@ int main (int argc, char *argv[]) { Kokkos::print_configuration(std::cout, detail); - enum : int { VectorLength = DefaultVectorLength::value, - RangeTagOper = 0, - TeamTagOper = 1 }; - + enum : int { + VectorLength = + DefaultVectorLength::value, + RangeTagOper = 0, + TeamTagOper = 1 + }; + // Unit tests bool profile = false; - for (int i=1;i( 3, 4, 2, 25, 2); - // Test::run(44, 63, 15, 4, 1); - // Test::run( 2, 2, 15, 3, 3); - // Test::run( 1, 1, 2, 63, 8); - + // Test::run( + // 3, 4, 2, 25, 2); + // Test::run(44, + // 63, 15, 4, 1); + // Test::run( + // 2, 2, 15, 3, 3); + // Test::run( + // 1, 1, 2, 63, 8); + // for (int nrhs=1;nrhs<=33;++nrhs) - // Test::run(2, 2, 15, 3, nrhs); + // Test::run(2, + // 2, 15, 3, nrhs); // } // std::cout << " Unit Test::Range :: End\n"; - + std::cout << " Unit Test::Team :: Begin\n"; { - Test::run( 3, 4, 2, 25, 2); - Test::run(44, 63, 15, 4, 1); - Test::run( 2, 2, 15, 3, 3); - Test::run( 1, 1, 2, 63, 8); - - for (int nrhs=1;nrhs<=33;++nrhs) - Test::run(2, 2, 15, 3, nrhs); + Test::run( + 3, 4, 2, 25, 2); + Test::run( + 44, 63, 15, 4, 1); + Test::run( + 2, 2, 15, 3, 3); + Test::run( + 1, 1, 2, 63, 8); + + for (int nrhs = 1; nrhs <= 33; ++nrhs) + Test::run(2, 2, 15, 3, nrhs); } std::cout << " Unit Test::Team :: End\n"; } @@ -101,9 +114,9 @@ int main (int argc, char *argv[]) { // Performance tests std::cout << " Perf Test:: 
Begin\n"; { - const Test::Input input(argc, argv); - Test::run(input); - } + const Test::Input input(argc, argv); + Test::run(input); + } std::cout << " Perf Test:: End\n"; Kokkos::finalize(); diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp index f682e1e119..1319fa03db 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp @@ -27,16 +27,16 @@ //#define KOKKOSBATCHED_USE_UNBLOCKED_ALGO 1 #define KOKKOSBATCHED_USE_BLOCKED_ALGO 1 -#if defined (KOKKOSBATCHED_USE_UNBLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_UNBLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Unblocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Unblocked AlgoGemm; typedef KokkosBatched::Algo::Trsv::Unblocked AlgoTrsv; typedef KokkosBatched::Algo::Gemv::Unblocked AlgoGemv; #endif -#if defined (KOKKOSBATCHED_USE_BLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Blocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_BLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Blocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Blocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Blocked AlgoGemm; @@ -48,8 +48,8 @@ typedef KokkosBatched::Algo::Gemv::Blocked AlgoGemv; using namespace KokkosBatched; -int main (int argc, char *argv[]) { - Kokkos::initialize(argc, argv); +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -57,53 +57,60 @@ int main (int argc, char *argv[]) { Kokkos::print_configuration(std::cout, detail); - enum : int { VectorLength = DefaultVectorLength::value, - RangeTagOper = 0 }; + enum : int { + VectorLength = + DefaultVectorLength::value, + RangeTagOper = 0 + }; // vector type - typedef Vector,VectorLength> VectorType; + typedef Vector, VectorLength> VectorType; // Unit tests bool profile = false; - for (int i=1;i( 3, 4, 2, 25, 2); - Test::run(44, 63, 15, 4, 1); - Test::run( 2, 2, 15, 3, 3); - - for (int nrhs=1;nrhs<=33;++nrhs) - Test::run(2, 2, 15, 3, nrhs); + Test::run(3, 4, 2, + 25, 2); + Test::run( + 44, 63, 15, 4, 1); + Test::run(2, 2, 15, + 3, 3); + + for (int nrhs = 1; nrhs <= 33; ++nrhs) + Test::run( + 2, 2, 15, 3, nrhs); } std::cout << " Unit Test::Range::Vector :: End\n"; } - + // MKL #if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) std::cout << " Perf Test::CompactMKL Begin\n"; { const bool test_mkl = true; - const Test::Input input(argc, argv); - Test::run(input, test_mkl); - } - std::cout << " Perf Test::CompactMKL End\n"; + const Test::Input input(argc, argv); + Test::run(input, test_mkl); + } + std::cout << " Perf Test::CompactMKL End\n"; #endif // Performance tests std::cout << " Perf Test::Vector Begin\n"; { - const Test::Input input(argc, argv); - Test::run(input); - } + const Test::Input input(argc, argv); + Test::run(input); + } std::cout << " Perf Test::Vector End\n"; #endif diff --git a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp index cd2e0015a0..f3237d9b4f 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp @@ -3,16 +3,15 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" - -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#if 
defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKJACOBI -#endif +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#define KOKKOSBATCHED_TEST_BLOCKJACOBI +#endif #endif #endif -#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) +#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) /// KokkosKernels headers #include "KokkosBatched_Util.hpp" @@ -35,57 +34,52 @@ #include "cuda_profiler_api.h" #endif - -using exec_space_type = Kokkos::DefaultExecutionSpace; +using exec_space_type = Kokkos::DefaultExecutionSpace; using memory_space_type = typename exec_space_type::memory_space; -using host_space = Kokkos::DefaultHostExecutionSpace; +using host_space = Kokkos::DefaultHostExecutionSpace; -using val_type = double; +using val_type = double; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using namespace KokkosBatched; -template -val_type computeResidual(const ManyMatrixType &A, - const ManyVectorType &x, - const ManyVectorType &b, - const ManyVectorType &r) { +template +val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, + const ManyVectorType &b, const ManyVectorType &r) { /// compute residual val_type residual(0); { - policy_type policy(A.extent(0), Kokkos::AUTO()); + policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::deep_copy(r, b); - Kokkos::parallel_reduce - ("compute-residual", - policy, KOKKOS_LAMBDA(const member_type &member, val_type &update) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto rr = Kokkos::subview(r, i, Kokkos::ALL()); - - TeamGemv - ::invoke(member, -one, AA, xx, one, rr); - - val_type sum(0); - Kokkos::parallel_reduce - (Kokkos::TeamThreadRange(member, rr.extent(0)), - [&](const int &k, val_type &lsum) { - lsum += Kokkos::ArithTraits::abs(rr(k)); - }, sum); - Kokkos::single(Kokkos::PerTeam(member), [&]() { - update += sum; - }); - }, residual); + Kokkos::parallel_reduce( + "compute-residual", policy, + KOKKOS_LAMBDA(const member_type &member, val_type &update) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto rr = Kokkos::subview(r, i, Kokkos::ALL()); + + TeamGemv::invoke(member, -one, AA, xx, one, + rr); + + val_type sum(0); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, rr.extent(0)), + [&](const int &k, val_type &lsum) { + lsum += Kokkos::ArithTraits::abs(rr(k)); + }, + sum); + Kokkos::single(Kokkos::PerTeam(member), [&]() { update += sum; }); + }, + residual); } return residual; } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) @@ -98,16 +92,15 @@ int main(int argc, char* argv[]) { /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int Blk = 5; /// block dimension - for (int i=1;i A("block diagonals", N, Blk, Blk); - Kokkos::View T("temporal block diagonals", N, Blk, Blk); - Kokkos::View x("x", N, Blk); - Kokkos::View b("b", N, Blk); + Kokkos::View A( + "block diagonals", N, Blk, Blk); + Kokkos::View T( + "temporal block diagonals", N, Blk, Blk); + Kokkos::View x("x", N, + Blk); + Kokkos::View b("b", N, + Blk); /// copy of A to check residual - Kokkos::View Acopy("Acopy", - A.extent(0), 
- A.extent(1), - A.extent(2)); + Kokkos::View Acopy( + "Acopy", A.extent(0), A.extent(1), A.extent(2)); /// residual vector - Kokkos::View r("r", - b.extent(0), - b.extent(1)); + Kokkos::View r( + "r", b.extent(0), b.extent(1)); - /// The block diagonal matrices are assumed to be extracted from a block sparse matrix. - /// Here we set the blocks with random values + /// The block diagonal matrices are assumed to be extracted from a block + /// sparse matrix. Here we set the blocks with random values Kokkos::Random_XorShift64_Pool random(13245); Kokkos::fill_random(A, random, val_type(1.0)); Kokkos::fill_random(b, random, val_type(1.0)); @@ -143,8 +137,8 @@ int main(int argc, char* argv[]) { /// /// Objective : /// - Construct the inverse of A(i,:,:) for all i. - /// - Solve the equation using matrix vector multiplication. - + /// - Solve the equation using matrix vector multiplication. + /// Task 1. Use the so-called standard batch interface /// parallel_for(factorize) /// parallel_For(set identity matrix) @@ -157,90 +151,95 @@ int main(int argc, char* argv[]) { cudaProfilerStart(); #endif Kokkos::deep_copy(A, Acopy); - + /// construction of block jacobi using batched blas interface /// each parallel for is a batch function { - policy_type policy(A.extent(0), Kokkos::AUTO()); - timer.reset(); - Kokkos::parallel_for - ("task1.factorize", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member,AA); - }); - Kokkos::deep_copy(T, A); - Kokkos::parallel_for - ("task1.set-identity", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamSetIdentity::invoke(member, AA); - }); - Kokkos::fence(); - Kokkos::parallel_for - ("task1.solve-lower-triangular", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - Kokkos::parallel_for - ("task1.solve-upper-triangular", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - const double t = timer.seconds(); - printf("task 1: construction of jacobi time = %f , # of constructions per min = %.0f \n", t, 1.0/t*60); + policy_type policy(A.extent(0), Kokkos::AUTO()); + timer.reset(); + Kokkos::parallel_for( + "task1.factorize", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, AA); + }); + Kokkos::deep_copy(T, A); + Kokkos::parallel_for( + "task1.set-identity", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamSetIdentity::invoke(member, AA); + }); + Kokkos::fence(); + Kokkos::parallel_for( + "task1.solve-lower-triangular", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, 
Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, + TT, AA); + }); + Kokkos::fence(); + Kokkos::parallel_for( + "task1.solve-upper-triangular", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, + one, TT, + AA); + }); + Kokkos::fence(); + const double t = timer.seconds(); + printf( + "task 1: construction of jacobi time = %f , # of constructions per " + "min = %.0f \n", + t, 1.0 / t * 60); } - + /// apply block jacobi { - timer.reset(); - policy_type policy(A.extent(0), Kokkos::AUTO()); - Kokkos::parallel_for - ("task1.apply-block-jacobi", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv - ::invoke(member, one, AA, bb, zero, xx); - }); - const double t = timer.seconds(); - printf("task 1: application of jacobi time = %f , # of applications per min = %.0f \n", t, 1.0/t*60); + timer.reset(); + policy_type policy(A.extent(0), Kokkos::AUTO()); + Kokkos::parallel_for( + "task1.apply-block-jacobi", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(b, i, Kokkos::ALL()); + TeamGemv::invoke(member, one, AA, bb, + zero, xx); + }); + const double t = timer.seconds(); + printf( + "task 1: application of jacobi time = %f , # of applications per " + "min = %.0f \n", + t, 1.0 / t * 60); } /// check residual { - const double residual = computeResidual(Acopy, x, b, r); - printf("task 1: residual = %f\n", residual); + const double residual = computeResidual(Acopy, x, b, r); + printf("task 1: residual = %f\n", residual); } -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) cudaProfilerStop(); -#endif +#endif } - - /// Task 2. Compose a new batch function using kokkos batched team-level interface + + /// Task 2. 
Compose a new batch function using kokkos batched team-level + /// interface /// parallel_for(LU, set identity, solve lower/upper triangular) /// parallel_for(matrix vector multiplication) @@ -249,78 +248,77 @@ int main(int argc, char* argv[]) { cudaProfilerStart(); #endif Kokkos::deep_copy(A, Acopy); - + /// construction of block jacobi using batched blas interface /// each parallel for is a batch function { - policy_type policy(A.extent(0), Kokkos::AUTO()); - timer.reset(); - Kokkos::parallel_for - ("task2.factorize-invert", - policy, KOKKOS_LAMBDA(const member_type &member) { - const val_type one(1); - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - - TeamLU::invoke(member,AA); - TeamCopy::invoke(member, AA, TT); - TeamSetIdentity::invoke(member, AA); - TeamTrsm - ::invoke(member, one, TT, AA); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - const double t = timer.seconds(); - printf("task 2: construction of jacobi time = %f , # of constructions per min = %.0f \n", t, 1.0/t*60); + policy_type policy(A.extent(0), Kokkos::AUTO()); + timer.reset(); + Kokkos::parallel_for( + "task2.factorize-invert", policy, + KOKKOS_LAMBDA(const member_type &member) { + const val_type one(1); + const int i = member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + + TeamLU::invoke(member, AA); + TeamCopy::invoke(member, AA, TT); + TeamSetIdentity::invoke(member, AA); + TeamTrsm::invoke(member, one, + TT, AA); + TeamTrsm::invoke(member, + one, TT, + AA); + }); + Kokkos::fence(); + const double t = timer.seconds(); + printf( + "task 2: construction of jacobi time = %f , # of constructions per " + "min = %.0f \n", + t, 1.0 / t * 60); } - + /// apply block jacobi { - timer.reset(); - policy_type policy(A.extent(0), Kokkos::AUTO()); - Kokkos::parallel_for - ("task2.apply-block-jacobi", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv - ::invoke(member, one, AA, bb, zero, xx); - }); - const double t = timer.seconds(); - printf("task 2: application of jacobi time = %f , # of applications per min = %.0f \n", t, 1.0/t*60); + timer.reset(); + policy_type policy(A.extent(0), Kokkos::AUTO()); + Kokkos::parallel_for( + "task2.apply-block-jacobi", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(b, i, Kokkos::ALL()); + TeamGemv::invoke(member, one, AA, bb, + zero, xx); + }); + const double t = timer.seconds(); + printf( + "task 2: application of jacobi time = %f , # of applications per " + "min = %.0f \n", + t, 1.0 / t * 60); } /// check residual { - const double residual = computeResidual(Acopy, x, b, r); - printf("task 2: residual = %f\n", residual); + const double residual = computeResidual(Acopy, x, b, r); + printf("task 2: residual = %f\n", residual); } -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) cudaProfilerStop(); -#endif +#endif } - } Kokkos::finalize(); return 0; } - #else -int main() { - 
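For reference, the fused "task2.factorize-invert" kernel above builds each explicit block inverse with the same four steps that task 1 issues as separate launches: factor the block in place as A = LU, copy the factors aside, reset the block to the identity, then run the two TeamTrsm solves (L*X = I, then U*Ainv = X). A minimal standalone sketch of that sequence for one small dense block, assuming row-major storage and no pivoting, and not part of this patch:

// Minimal sketch: invert one n x n block by the same sequence the kernel uses,
// i.e. in-place LU (no pivoting), set the identity, then a unit-lower and a
// non-unit upper triangular solve.
#include <cstdio>
#include <vector>

int main() {
  const int n = 3;
  std::vector<double> A = {4, 1, 0,   // a small, diagonally dominant block
                           1, 4, 1,   // (row-major storage for the sketch)
                           0, 1, 4};
  std::vector<double> Ainv(n * n, 0.0);

  // Step 1: in-place LU, analogous to TeamLU::invoke(member, AA).
  for (int k = 0; k < n; ++k)
    for (int i = k + 1; i < n; ++i) {
      A[i * n + k] /= A[k * n + k];
      for (int j = k + 1; j < n; ++j)
        A[i * n + j] -= A[i * n + k] * A[k * n + j];
    }

  // Step 2: start from the identity, analogous to TeamSetIdentity.
  for (int i = 0; i < n; ++i) Ainv[i * n + i] = 1.0;

  // Step 3: solve L * X = I (unit lower), analogous to the first TeamTrsm.
  for (int j = 0; j < n; ++j)
    for (int i = 1; i < n; ++i)
      for (int k = 0; k < i; ++k)
        Ainv[i * n + j] -= A[i * n + k] * Ainv[k * n + j];

  // Step 4: solve U * Ainv = X (non-unit upper), analogous to the second TeamTrsm.
  for (int j = 0; j < n; ++j)
    for (int i = n - 1; i >= 0; --i) {
      for (int k = i + 1; k < n; ++k)
        Ainv[i * n + j] -= A[i * n + k] * Ainv[k * n + j];
      Ainv[i * n + j] /= A[i * n + i];
    }

  // Ainv now holds the block inverse; applying it is one GEMV, as in
  // "task2.apply-block-jacobi".
  for (int i = 0; i < n; ++i, printf("\n"))
    for (int j = 0; j < n; ++j) printf(" % .4f", Ainv[i * n + j]);
  return 0;
}

Fusing the four steps into one team kernel is the point of task 2: it avoids three extra kernel launches and the global-memory round trips between them, while producing the same explicit inverse.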
return 0; -} +int main() { return 0; } #endif - diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index 4183380854..a8b3de209b 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,14 +3,13 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT -#endif +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT +#endif #endif #endif - #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) @@ -60,55 +59,57 @@ typedef double value_type; /// using namespace KokkosBatched; -static constexpr int vector_length = DefaultVectorLength::value; +static constexpr int vector_length = + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif -typedef Vector,vector_length> vector_type; +typedef Vector, vector_length> vector_type; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -typedef Vector,internal_vector_length> internal_vector_type; +typedef Vector, internal_vector_length> internal_vector_type; #else typedef value_type internal_vector_type; #endif -template +template struct FactorizeModeAndAlgo; -template<> +template <> struct FactorizeModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level3::Blocked algo_type; + typedef Algo::Level3::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct FactorizeModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level3::Unblocked algo_type; + typedef Algo::Level3::Unblocked algo_type; }; #endif -template +template struct SolveModeAndAlgo; -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level2::Blocked algo_type; + typedef Algo::Level2::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level2::Unblocked algo_type; + typedef Algo::Level2::Unblocked algo_type; }; #endif -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -116,20 +117,20 @@ int main(int argc, char* argv[]) { #endif Kokkos::print_configuration(std::cout); - //typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::Details::ArithTraits ats; Kokkos::Timer timer; /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int L = 128; /// length of block tridiags - int Blk = 5; /// block dimension - int Nvec = 1; - int S = 0; /// scratch size + int N = 128 * 128; /// # of problems (batch size) + int L = 128; /// length of block tridiags + int Blk = 5; /// block dimension + int Nvec = 1; + int S = 0; /// scratch size int niter = 1; - for (int i=1;i Av("A", - N/vector_length, L, 3, Blk, Blk); + Kokkos::View Av( + "A", N / vector_length, L, 3, Blk, Blk); /// double - Kokkos::View As((value_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - 
Av.extent(3), - Av.extent(4), - vector_length); + Kokkos::View As( + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), + Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View Ai((internal_vector_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length/internal_vector_length); + Kokkos::View + Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), + Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv("x", - N/vector_length, Nvec, L, Blk); + Kokkos::View xv( + "x", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View xs((value_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - vector_length); + Kokkos::View xs( + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), + xv.extent(3), vector_length); /// double 2 - Kokkos::View xi((internal_vector_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - vector_length/internal_vector_length); + Kokkos::View + xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), + xv.extent(2), xv.extent(3), vector_length / internal_vector_length); /// double 16 - Kokkos::View bv("b", - N/vector_length, Nvec, L, Blk); + Kokkos::View bv( + "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs((value_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length); + Kokkos::View bs( + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), + bv.extent(3), vector_length); /// double 2 - Kokkos::View bi((internal_vector_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length/internal_vector_length); - + Kokkos::View + bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), + bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy("Acopy", - As.extent(0), - As.extent(1), - As.extent(2), - As.extent(3), - As.extent(4), - As.extent(5)); - - Kokkos::View rs("rs", - bs.extent(0), - bs.extent(1), - bs.extent(2), - bs.extent(3), - bs.extent(4)); + Kokkos::View Acopy( + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), + As.extent(4), As.extent(5)); + + Kokkos::View rs( + "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), + bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -245,17 +220,21 @@ int main(int argc, char* argv[]) { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for - ("setTridiagToIdentity", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,AA.extent(1)),[&](const int &j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - for (int k=0,kend=AA.extent(3);k random(13245); Kokkos::fill_random(As, random, one); Kokkos::fill_random(bs, random, one); - + Kokkos::deep_copy(Acopy, As); } @@ -284,70 +263,76 @@ int main(int argc, char* argv[]) { timer.reset(); using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 64/AA.extent(5); - } else { team_size = 128/AA.extent(5); } + int team_size = 0; + 
if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 64 / AA.extent(5); + } else { + team_size = 128 / AA.extent(5); + } policy_type policy(AA.extent(0), team_size, AA.extent(5)); - Kokkos::parallel_for - ("factorize", - policy.set_scratch_size(0,Kokkos::PerTeam(S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef FactorizeModeAndAlgo default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank(); - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto AAA = Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), v); - - /// subview patterns - auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - - if (L == 1) { - A.assign_data( &AAA(0, 1, 0, 0) ); - LU::invoke(member, A); - } else { - for (int k=0;k<(L-1);++k) { - A.assign_data( &AAA(k, 1, 0, 0) ); - B.assign_data( &AAA(k, 2, 0, 0) ); - C.assign_data( &AAA(k, 0, 0, 0) ); - D.assign_data( &AAA(k+1, 1, 0, 0) ); - - LU - ::invoke(member, A); - Trsm - ::invoke(member, 1.0, A, B); - Trsm - ::invoke(member, 1.0, A, C); - Gemm - ::invoke(member, -1.0, C, B, 1.0, D); - } - LU - ::invoke(member, D); - } - }); - }); + Kokkos::parallel_for( + "factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef FactorizeModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto AAA = + Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), v); + + /// subview patterns + auto A = + Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + auto B = + Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); + auto C = + Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + auto D = + Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + + if (L == 1) { + A.assign_data(&AAA(0, 1, 0, 0)); + LU::invoke(member, A); + } else { + for (int k = 0; k < (L - 1); ++k) { + A.assign_data(&AAA(k, 1, 0, 0)); + B.assign_data(&AAA(k, 2, 0, 0)); + C.assign_data(&AAA(k, 0, 0, 0)); + D.assign_data(&AAA(k + 1, 1, 0, 0)); + + LU::invoke(member, A); + Trsm::invoke(member, 1.0, A, B); + Trsm::invoke(member, 1.0, A, C); + Gemm::invoke(member, -1.0, C, B, + 1.0, D); + } + LU::invoke(member, D); + } + }); + }); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("factorize time = %f , # of factorization per min = %f \n", t, 1.0/t*60); + printf("factorize time = %f , # of factorization per min = %f \n", t, + 1.0 / t * 60); } /// @@ -360,133 +345,144 @@ int main(int argc, char* argv[]) { timer.reset(); using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 64/AA.extent(5); - } else { team_size = 128/AA.extent(5); } - 
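For reference, the "factorize" kernel above performs one block LU sweep per tridiagonal system: it factors the diagonal block, replaces the off-diagonal blocks through the two triangular solves, and subtracts the Schur complement C * A^{-1} * B from the next diagonal block; the "solve" kernel that follows then does blockwise forward and backward substitution. A scalar (1x1 block) analogue of the same recursion, as a minimal illustration rather than the kernels' actual code path:

// Scalar analogue of the factorize/solve recursion: d = diagonal,
// u = superdiagonal, c = subdiagonal of one tridiagonal system of length L.
#include <cstdio>
#include <vector>

int main() {
  const int L = 5;
  std::vector<double> d(L, 4.0), u(L - 1, 1.0), c(L - 1, 1.0);
  std::vector<double> x = {1, 2, 3, 4, 5};  // right-hand side, solved in place

  // Factorization sweep: scale the superdiagonal by the factored diagonal
  // (the Trsm step) and subtract the Schur complement from the next diagonal
  // (the Gemm step, D_{k+1} -= C_k * A_k^{-1} * B_k).
  for (int k = 0; k < L - 1; ++k) {
    u[k] /= d[k];
    d[k + 1] -= c[k] * u[k];
  }

  // Forward substitution (Trsv with the diagonal, Gemv with the subdiagonal).
  for (int k = 0; k < L - 1; ++k) {
    x[k] /= d[k];
    x[k + 1] -= c[k] * x[k];
  }
  x[L - 1] /= d[L - 1];

  // Backward substitution (with the scaled superdiagonal).
  for (int k = L - 1; k > 0; --k) x[k - 1] -= u[k - 1] * x[k];

  for (int k = 0; k < L; ++k) printf("x[%d] = %f\n", k, x[k]);
  return 0;
}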
- policy_type policy(AA.extent(0), team_size, AA.extent(5)); - for (int iter=0;iter default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 64 / AA.extent(5); + } else { + team_size = 128 / AA.extent(5); + } - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0;jvec - ::invoke(member, bk, xb); - member.team_barrier(); - } - } - const int kend = L - 1; - for (int k=0;k - ::invoke(member, bk, xb); - } - - Trsv - ::invoke(member, 1.0, LT, xt); - - Gemv - ::invoke(member, -1.0, LB, xt, 1.0, xb); - } - { - LT.assign_data(&A(kend, 0, 0)); - xt.assign_data(&x(kend, 0)); - Trsv - ::invoke(member, 1.0, LT, xt); - } - } /// end forward substitution - - /// - /// backward substitution - /// - { - auto UT = Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); - auto UB = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - - const int kbegin = L - 1; - for (int k=kbegin;k>0;--k) { - UT.assign_data(&B(k-1, 0, 0)); - UB.assign_data(&A(k, 0, 0)); - - xt.assign_data(&x(k-1, 0)); - xb.assign_data(&x(k, 0)); - - Trsv - ::invoke(member, 1.0, UB, xb); - - Gemv - ::invoke(member, -1.0, UT, xb, 1.0, xt); - } - { - UB.assign_data(&A(0, 0, 0)); - xb.assign_data(&x(0, 0)); - Trsv - ::invoke(member, 1.0, UB, xb); + policy_type policy(AA.extent(0), team_size, AA.extent(5)); + for (int iter = 0; iter < niter; ++iter) { + Kokkos::parallel_for( + "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef SolveModeAndAlgo + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x = Kokkos::subview(xx, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto b = Kokkos::subview(bb, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + + auto xt = Kokkos::subview(x, 0, Kokkos::ALL()); + auto xb = Kokkos::subview(x, 0, Kokkos::ALL()); + + /// + /// forward substitution + /// + { + // const bool is_same_x_and_b = (x.data() == b.data()); + auto LT = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto LB = + Kokkos::subview(C, 0, Kokkos::ALL(), Kokkos::ALL()); + + auto bk = Kokkos::subview(b, 0, Kokkos::ALL()); + { + { // if (!is_same_x_and_b) { + Copy::invoke(member, bk, xb); + member.team_barrier(); + } + } + const int kend = L - 1; + for (int k = 0; k < kend; ++k) { + LT.assign_data(&A(k, 0, 0)); + LB.assign_data(&C(k, 0, 0)); + + xt.assign_data(&x(k, 0)); + xb.assign_data(&x(k + 1, 0)); + + { // if (!is_same_x_and_b) { + bk.assign_data(&b(k + 1, 0)); + Copy::invoke(member, bk, xb); + } + + 
Trsv::invoke(member, + 1.0, + LT, + xt); + + Gemv::invoke(member, -1.0, LB, xt, 1.0, + xb); + } + { + LT.assign_data(&A(kend, 0, 0)); + xt.assign_data(&x(kend, 0)); + Trsv::invoke(member, + 1.0, + LT, + xt); + } + } /// end forward substitution + + /// + /// backward substitution + /// + { + auto UT = + Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); + auto UB = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + + const int kbegin = L - 1; + for (int k = kbegin; k > 0; --k) { + UT.assign_data(&B(k - 1, 0, 0)); + UB.assign_data(&A(k, 0, 0)); + + xt.assign_data(&x(k - 1, 0)); + xb.assign_data(&x(k, 0)); + + Trsv::invoke(member, 1.0, UB, xb); + + Gemv::invoke(member, -1.0, UT, xb, 1.0, + xt); + } + { + UB.assign_data(&A(0, 0, 0)); + xb.assign_data(&x(0, 0)); + Trsv::invoke(member, 1.0, UB, xb); + } + } // end backward substitution } - } // end backward substitution - } - }); - }); + }); + }); Kokkos::fence(); } const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, 1.0/t*60*niter); + printf("solve time = %f , # of solves per min = %f\n", t, + 1.0 / t * 60 * niter); } - + /// /// compute residual /// @@ -495,105 +491,118 @@ int main(int argc, char* argv[]) { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); - Kokkos::parallel_for - ("compute residual", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0,jvecend=rs.extent(1);jvec - ::invoke(member, b0, r0); - TeamGemv - ::invoke(member, -1.0, A0, x0, 1.0, r0); - } else { - int k = 0; - { - /// first row - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - ++k; - } - for (;k<(L-1);++k) { - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); + Kokkos::parallel_for( + "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + Kokkos::parallel_for( + 
Kokkos::ThreadVectorRange(member, Acopy.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; + ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, + b0, r0); + TeamGemv::invoke(member, -1.0, A0, x0, 1.0, + r0); + } else { + int k = 0; + { + /// first row + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, + rk); + ++k; + } + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, + rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, + rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, + rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + } + } } - { - // last row - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - } - } - } - }); - }); + }); + }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); auto bs_host = Kokkos::create_mirror_view(bs); @@ -602,17 +611,19 @@ int main(int argc, char* argv[]) { 
Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0=0,i0end=rs.extent(0);i0::value; +static constexpr int vector_length = + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif -typedef Vector,vector_length> vector_type; +typedef Vector, vector_length> vector_type; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -typedef Vector,internal_vector_length> internal_vector_type; +typedef Vector, internal_vector_length> internal_vector_type; #else typedef value_type internal_vector_type; #endif -template +template struct InverseDiagonalsModeAndAlgo; -template<> +template <> struct InverseDiagonalsModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level3::Blocked algo_type; + typedef Algo::Level3::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct InverseDiagonalsModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level3::Unblocked algo_type; + typedef Algo::Level3::Unblocked algo_type; }; #endif -template +template struct SolveModeAndAlgo; -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level2::Blocked algo_type; + typedef Algo::Level2::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level2::Unblocked algo_type; + typedef Algo::Level2::Unblocked algo_type; }; #endif -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -115,21 +117,21 @@ int main(int argc, char* argv[]) { #endif Kokkos::print_configuration(std::cout); - //typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::Details::ArithTraits ats; Kokkos::Timer timer; /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int L = 128; /// length of block tridiags - int Blk = 5; /// block dimension - int Nvec = 1; - int S = 0; /// scratch size - int niter = 1; - int nsweep = 10; - for (int i=1;i Av("A", - N/vector_length, L, 4, Blk, Blk); + Kokkos::View Av( + "A", N / vector_length, L, 4, Blk, Blk); /// double - Kokkos::View As((value_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length); + Kokkos::View As( + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), + Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View Ai((internal_vector_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length/internal_vector_length); + Kokkos::View + Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), + Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv("x", - N/vector_length, Nvec, 2, L, Blk); + Kokkos::View xv( + "x", N / vector_length, Nvec, 2, L, Blk); /// double - Kokkos::View xs((value_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - xv.extent(4), - vector_length); + Kokkos::View xs( + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), + xv.extent(3), xv.extent(4), vector_length); /// double 2 - Kokkos::View xi((internal_vector_type*)xv.data(), - 
xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - xv.extent(4), - vector_length/internal_vector_length); + Kokkos::View + xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), + xv.extent(2), xv.extent(3), xv.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View bv("b", - N/vector_length, Nvec, L, Blk); + Kokkos::View bv( + "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs((value_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length); + Kokkos::View bs( + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), + bv.extent(3), vector_length); /// double 2 - Kokkos::View bi((internal_vector_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length/internal_vector_length); - + Kokkos::View + bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), + bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy("Acopy", - As.extent(0), - As.extent(1), - As.extent(2), - As.extent(3), - As.extent(4), - As.extent(5)); - - Kokkos::View rs("rs", - bs.extent(0), - bs.extent(1), - bs.extent(2), - bs.extent(3), - bs.extent(4)); + Kokkos::View Acopy( + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), + As.extent(4), As.extent(5)); + + Kokkos::View rs( + "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), + bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -243,7 +218,7 @@ int main(int argc, char* argv[]) { Kokkos::fill_random(bs, random, value_type(1.0)); /// - /// diagonal dominant + /// diagonal dominant /// if (1) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -251,18 +226,21 @@ int main(int argc, char* argv[]) { #endif using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - policy_type policy(AA.extent(0)*L, Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for - ("diagonal dominant", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank()/L; - const int k = member.league_rank()%L; - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,Blk),[&](const int &j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - AA(i, k, 1, j, j, v) += internal_vector_type(9*Blk); + policy_type policy(AA.extent(0) * L, Kokkos::AUTO(), AA.extent(5)); + Kokkos::parallel_for( + "diagonal dominant", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank() / L; + const int k = member.league_rank() % L; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, Blk), [&](const int &j) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + AA(i, k, 1, j, j, v) += internal_vector_type(9 * Blk); + }); }); - }); - }); + }); Kokkos::fence(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); @@ -280,61 +258,70 @@ int main(int argc, char* argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - + typedef Kokkos::View + scratch_view_type; + using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - const int per_team_scratch = scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 32/AA.extent(5); - 
} else { team_size = 64/AA.extent(5); } - - policy_type policy(AA.extent(0)*L, team_size, AA.extent(5)); - Kokkos::parallel_for - ("inverse diagonals", - policy.set_scratch_size(0,Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef InverseDiagonalsModeAndAlgo default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank()/L; - const int k = member.league_rank()%L; - - scratch_view_type WW(member.team_scratch(0), Blk, Blk, AA.extent(5)); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); - auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); - - Copy - ::invoke(member, A, W); - SetIdentity - ::invoke(member, D); - member.team_barrier(); - LU::invoke(member, W); - Trsm - ::invoke(member, 1.0, W, D); - Trsm - ::invoke(member, 1.0, W, D); - }); - }); + const int per_team_scratch = + scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 32 / AA.extent(5); + } else { + team_size = 64 / AA.extent(5); + } + + policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); + Kokkos::parallel_for( + "inverse diagonals", + policy.set_scratch_size( + 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef InverseDiagonalsModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank() / L; + const int k = member.league_rank() % L; + + scratch_view_type WW(member.team_scratch(0), Blk, Blk, + AA.extent(5)); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), + Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), + Kokkos::ALL(), v); + auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); + + Copy::invoke( + member, A, W); + SetIdentity::invoke(member, D); + member.team_barrier(); + LU::invoke(member, W); + Trsm::invoke(member, 1.0, W, + D); + Trsm::invoke(member, 1.0, + W, D); + }); + }); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("inverse time = %f , # of inverse per min = %f \n", t, 1.0/t*60); + printf("inverse time = %f , # of inverse per min = %f \n", t, + 1.0 / t * 60); } /// @@ -346,75 +333,114 @@ int main(int argc, char* argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - const int per_team_scratch = scratch_view_type::shmem_size(Blk, AA.extent(5)); - + typedef Kokkos::View + scratch_view_type; + const int per_team_scratch = + scratch_view_type::shmem_size(Blk, AA.extent(5)); + using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 32/AA.extent(5); - } else { team_size = 32/AA.extent(5); } - policy_type 
policy(AA.extent(0)*L, team_size, AA.extent(5)); - - for (int iter=0;iter default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - scratch_view_type WW(member.team_scratch(0), Blk, AA.extent(5)); - const int i = member.league_rank()/L; //%AA.extent(0); - const int k = member.league_rank()%L; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, k ? k-1 : 0, 0, Kokkos::ALL(), Kokkos::ALL(), v); - auto u = Kokkos::subview(WW, Kokkos::ALL(), v); - for (int jvec=0;jvec::invoke(member, 1.0, D, b, 0.0, x1); - } else { - Copy::invoke(member, b, u); - if (k == 0) { - Gemv::invoke(member, -1.0, B, x2, 1.0, u); - } else if (k == L-1) { - Gemv::invoke(member, -1.0, C, x0, 1.0, u); - } else { - Gemv::invoke(member, -1.0, B, x2, 1.0, u); - Gemv::invoke(member, -1.0, C, x0, 1.0, u); - } - Gemv::invoke(member, 1.0, D, u, 0.0, y1); - } - } - }); - }); - auto tmp = xxx; xxx = yyy; yyy = tmp; - } - Kokkos::fence(); + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 32 / AA.extent(5); + } else { + team_size = 32 / AA.extent(5); + } + policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); + + for (int iter = 0; iter < niter; ++iter) { + auto xxx = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto yyy = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + + for (int nis = 0; nis < nsweep; ++nis) { + Kokkos::parallel_for( + "solve", + policy.set_scratch_size( + 0, + Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef SolveModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + scratch_view_type WW(member.team_scratch(0), Blk, AA.extent(5)); + const int i = member.league_rank() / L; //%AA.extent(0); + const int k = member.league_rank() % L; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), + Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), + Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), + Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, k ? k - 1 : 0, 0, + Kokkos::ALL(), Kokkos::ALL(), v); + auto u = Kokkos::subview(WW, Kokkos::ALL(), v); + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x0 = Kokkos::subview( + xxx, i, jvec, k == 0 ? 0 : k - 1, Kokkos::ALL(), v); + auto x1 = + Kokkos::subview(xxx, i, jvec, k, Kokkos::ALL(), v); + auto x2 = Kokkos::subview(xxx, i, jvec, + k == L - 1 ? 
0 : k + 1, + Kokkos::ALL(), v); + auto y1 = + Kokkos::subview(yyy, i, jvec, k, Kokkos::ALL(), v); + auto b = + Kokkos::subview(bb, i, jvec, k, Kokkos::ALL(), v); + + if (L == 1) { + Gemv::invoke(member, 1.0, D, b, 0.0, x1); + } else { + Copy::invoke(member, b, u); + if (k == 0) { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + } else if (k == L - 1) { + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); + } else { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); + } + Gemv::invoke(member, 1.0, D, u, 0.0, y1); + } + } + }); + }); + auto tmp = xxx; + xxx = yyy; + yyy = tmp; + } + Kokkos::fence(); } const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, 1.0/t*60*niter); + printf("solve time = %f , # of solves per min = %f\n", t, + 1.0 / t * 60 * niter); } - + /// /// compute residual /// @@ -422,105 +448,142 @@ int main(int argc, char* argv[]) { typedef KokkosBatched::Algo::Level2::Unblocked algo_type; using policy_type = Kokkos::TeamPolicy; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); - Kokkos::parallel_for - ("compute residual", - policy, KOKKOS_LAMBDA(const typename policy_type::member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0,jvecend=rs.extent(1);jvec - ::invoke(member, b0, r0); - TeamGemv - ::invoke(member, -1.0, A0, x0, 1.0, r0); - } else { - int k = 0; - { - /// first row - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - ++k; + Kokkos::parallel_for( + "compute residual", policy, + KOKKOS_LAMBDA(const typename policy_type::member_type &member) { + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, Acopy.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; + ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, nsweep % 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, b0, r0); + TeamGemv::invoke(member, + 
-1.0, A0, + x0, 1.0, + r0); + } else { + int k = 0; + { + /// first row + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + B2, x2, + 1.0, + rk); + ++k; + } + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + C0, x0, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + B2, x2, + 1.0, + rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + C0, x0, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + } + } } - for (;k<(L-1);++k) { - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - } - { - // last row - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - } - } - } - }); - }); + }); + }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); auto bs_host = Kokkos::create_mirror_view(bs); @@ -529,17 +592,19 @@ int main(int argc, char* argv[]) { Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0=0,i0end=rs.extent(0);i0 @@ -24,622 +24,638 @@ #include "KokkosBatched_Gemm_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { +namespace 
PerfTest { #undef FLOP_MUL #undef FLOP_ADD #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; +typedef double value_type; - double FlopCount(int mm, int nn, int kk) { - double m = (double)mm; double n = (double)nn; double k = (double)kk; - return (FLOP_MUL*(m*n*k) + - FLOP_ADD*(m*n*k)); - } +double FlopCount(int mm, int nn, int kk) { + double m = (double)mm; + double n = (double)nn; + double k = (double)kk; + return (FLOP_MUL * (m * n * k) + FLOP_ADD * (m * n * k)); +} - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; - - template - struct Functor { - ConstUnmanagedViewType _a, _b; - UnmanagedViewType _c; - - KOKKOS_INLINE_FUNCTION - Functor() = default; - - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a, - const ViewType &b, - const ViewType &c) - : _a(a), _b(b), _c(c) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm:: - invoke(1.0, aa, bb, 1.0, cc); - } +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm:: - invoke(1.0, aa, bb, 1.0, cc); - } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - TeamGemm:: - invoke(member, 1.0, aa, bb, 1.0, cc); - } - }); - } +template +struct Functor { + ConstUnmanagedViewType _a, _b; + UnmanagedViewType _c; - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - 
TeamCopy::invoke(member, bb, sbb); - member.team_barrier(); - - TeamGemm:: - invoke(member, 1.0, saa, sbb, 1.0, cc); - } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagHandmade &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,m*n), - [&](const int &ij) { - const int i = ij%m, j = ij/m; - typename ViewType::non_const_value_type cval = 0; - for (int p=0;p - void Gemm(const int NN, const int BlkSize) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } + KOKKOS_INLINE_FUNCTION + Functor() = default; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); - const double tmax = 1.0e15; + KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a, const ViewType &b, const ViewType &c) + : _a(a), _b(b), _c(c) {} - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - const int iter_begin = -3, iter_end = 30; - Kokkos::Timer timer; + SerialGemm::invoke( + 1.0, aa, bb, 1.0, cc); + } - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, BlkSize), - cref("cref", N*VectorLength, BlkSize, BlkSize); + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + } + }); + } - { - Random random; - for (int k=0;k + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + TeamGemm::invoke(member, 1.0, aa, bb, 1.0, cc); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + const int lvl = 0; + ScratchViewType 
sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + ScratchViewType sb(member.team_scratch(lvl), VectorLength, + _b.extent(1), _b.extent(2)); + + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + TeamCopy::invoke(member, bb, sbb); + member.team_barrier(); + + TeamGemm::invoke(member, 1.0, saa, sbb, 1.0, cc); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagHandmade &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + const int i = ij % m, j = ij / m; + typename ViewType::non_const_value_type cval = 0; + for (int p = 0; p < q; ++p) + cval += _a(kk, i, p) * _b(kk, p, j); + _c(kk, i, j) += cval; + }); + } + }); + } +}; + +template +void Gemm(const int NN, const int BlkSize) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } - // P100 L2 cache 4MB per core - constexpr size_t LLC_CAPACITY = 56*4*1024*1024; - Flush flush; + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize, BlkSize); + const double tmax = 1.0e15; + + typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; + typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; + + const int iter_begin = -3, iter_end = 30; + Kokkos::Timer timer; + + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, BlkSize), + cref("cref", N * VectorLength, BlkSize, BlkSize); + + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat(k, i, j) = random.value(); + bmat(k, i, j) = random.value(); + } + } + + // P100 L2 cache 4MB per core + constexpr size_t LLC_CAPACITY = 56 * 4 * 1024 * 1024; + Flush flush; #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Strided version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); + if (1) { + /// + /// CUBLAS Strided version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride), + b("b", stride), c("c", stride); + + double tavg = 0, tmin = tmax; - Kokkos::View - a("a", stride), - b("b", stride), - c("c", stride); + cublasStatus_t stat; + cublasHandle_t handle; - double tavg = 0, 
tmin = tmax; + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); - cublasStatus_t stat; - cublasHandle_t handle; + auto amat_device = + Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); + auto bmat_device = + Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + Kokkos::deep_copy(amat_device, amat); + Kokkos::deep_copy(bmat_device, bmat); - auto amat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); - auto bmat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); + Kokkos::fence(); - Kokkos::deep_copy(amat_device, amat); - Kokkos::deep_copy(bmat_device, bmat); + const double one(1.0), zero(0.0); + { + tavg = 0; + tmin = tmax; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_device); + Kokkos::deep_copy(b, bmat_device); + Kokkos::deep_copy(c, 0); Kokkos::fence(); + timer.reset(); - const double one(1.0), zero(0.0); - { - tavg = 0; tmin = tmax; - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - Kokkos::deep_copy(cref, csol); - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Strided" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = N/A" - << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; - } - cublasDestroy(handle); + stat = cublasDgemmStridedBatched( + handle, CUBLAS_OP_N, CUBLAS_OP_N, BlkSize, BlkSize, BlkSize, &one, + (const value_type *)a.data(), BlkSize, BlkSize * BlkSize, + (const value_type *)b.data(), BlkSize, BlkSize * BlkSize, &zero, + (value_type *)c.data(), BlkSize, BlkSize * BlkSize, + N * VectorLength); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + Kokkos::deep_copy(cref, csol); + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Strided" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + cublasDestroy(handle); + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + 
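For context, every variant timed in this file shares the same measurement protocol: the iteration counter starts at a negative value so the first few runs act as warm-up, the minimum time gives the best-case rate, and only non-negative iterations feed the average through the `(iter >= 0) * t` trick. A condensed, self-contained sketch of that protocol follows; the `do_work` callable, the label, and the iteration counts are placeholders for illustration and are not part of the patch.

#include <algorithm>
#include <iostream>
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>

// Condensed sketch of the warm-up / min / average timing scheme used by every
// variant in these perf tests. "Work" is any callable launching the kernel.
template <typename Work>
void time_variant(const char *label, const double flop, Work do_work) {
  const int iter_begin = -3, iter_end = 30;  // negative iterations are warm-up only
  Kokkos::Timer timer;
  double tavg = 0, tmin = 1.0e15;
  for (int iter = iter_begin; iter < iter_end; ++iter) {
    Kokkos::fence();
    timer.reset();
    do_work();                    // the kernel being measured
    Kokkos::fence();
    const double t = timer.seconds();
    tmin = std::min(tmin, t);
    tavg += (iter >= 0) * t;      // warm-up runs do not count toward the average
  }
  tavg /= iter_end;               // iter_end equals the number of measured runs
  std::cout << label << " avg flop/s = " << (flop / tavg)
            << " max flop/s = " << (flop / tmin) << std::endl;
}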
timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", + policy, functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V1 - almost same scheduling with range policy; - /// expect the same performance as range policy - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - - typedef Functor functor_type; - - // 128 is rough estimates - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + + typedef Functor functor_type; + + // 128 is rough estimates + const int team_size = + policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", + policy, functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << 
" diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V2 - team parallel - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*mblk,4), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); + + policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", + policy, functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V3 - team parallel + scratch - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch = 2*ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - //std::cout << "per team scratch " << 
per_team_scratch << "\n"; - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*mblk,4), max_team_size); - - policy_type policy = policy_type(N, team_size, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = 2 * ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + // std::cout << "per team scratch " << per_team_scratch << "\n"; + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); + + policy_type policy = + policy_type(N, team_size, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, + functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = Kokkos::create_mirror_view( + typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) + << (per_team_scratch / 1024) << " time = " << std::scientific + << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } else { - std::cout << std::setw(8) << "Kokkos" - << std::setw(8) << "Team V3" - << " Scratch per team is too big:" << std::setw(3) << (per_team_scratch/1024) - << std::endl; - } - } + std::cout << std::endl; + } else { + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " Scratch per team is too big:" << std::setw(3) + << (per_team_scratch / 1024) << std::endl; } + } + } - if (1) { - /// - /// Team policy - handmade - /// - 
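The `per_team_scratch / 1024 < 48` guard in the Team V3 branch reflects the usual 48 KB per-block shared-memory limit on NVIDIA GPUs; requesting more level-0 scratch than that would make the team launch fail, so the variant is simply skipped and reported as "Scratch per team is too big". A minimal sketch of the scratch-sizing pattern itself is given below, using placeholder aliases (`scratch_view`, `make_policy`) rather than the ScratchViewType helper the patch uses.

#include <Kokkos_Core.hpp>

// Illustrative only: request level-0 (shared-memory) scratch for two
// VectorLength x Blk x Blk staging buffers per team, as the Team V3 variant does.
using exec_space = Kokkos::DefaultExecutionSpace;
using scratch_view =
    Kokkos::View<double ***, exec_space::scratch_memory_space,
                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

Kokkos::TeamPolicy<exec_space> make_policy(int league, int team, int vlen, int blk) {
  // bytes of team scratch needed for the two staging buffers
  const int per_team_scratch = 2 * scratch_view::shmem_size(vlen, blk, blk);
  return Kokkos::TeamPolicy<exec_space>(league, team, vlen)
      .set_scratch_size(0, Kokkos::PerTeam(per_team_scratch));
}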
typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - - const int team_size = std::min(max_team_size,BlkSize*BlkSize); - - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + + const int team_size = std::min(max_team_size, BlkSize * BlkSize); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, + functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team HM" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } - std::cout << std::endl; } } -} + std::cout << std::endl; +} +} // namespace PerfTest +} // namespace KokkosBatched using namespace KokkosBatched; -template +template void run(const int N, const int B) { typedef Kokkos::DefaultExecutionSpace ExecSpace; @@ -648,27 +664,25 @@ void run(const int N, const int B) { if (B != 0) { PerfTest::Gemm(N, B); } else { - PerfTest::Gemm(N, 3); - PerfTest::Gemm(N, 5); + PerfTest::Gemm(N, 3); + PerfTest::Gemm(N, 5); PerfTest::Gemm(N, 10); PerfTest::Gemm(N, 15); - + // PerfTest::Gemm(N, 4); // PerfTest::Gemm(N, 8); // PerfTest::Gemm(N, 16); // PerfTest::Gemm(N, 18); } - } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0; + int N = 128 * 128, B = 0; - for (int i=1;i(N, B); - - std::cout << "\n Testing LayoutLeft Algo::Gemm::Blocked\n"; + + std::cout << "\n Testing LayoutLeft Algo::Gemm::Blocked\n"; run(N, B); } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp index 4e827f34b6..de67d9c804 100644 --- 
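Before the host-side file below, it is worth spelling out the core operation that every policy variant above ultimately dispatches to: an independent small dense update C := alpha*A*B + beta*C for each matrix in the batch, invoked through the KokkosBatched serial interface. The following is a minimal host example assuming the usual KokkosBatched headers; the matrix sizes and fill values are arbitrary and chosen only for illustration.

#include <Kokkos_Core.hpp>
#include "KokkosBatched_Util.hpp"
#include "KokkosBatched_Gemm_Decl.hpp"
#include "KokkosBatched_Gemm_Serial_Impl.hpp"

using namespace KokkosBatched;

int main(int argc, char *argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int nbatch = 4, blk = 5;
    Kokkos::View<double ***, Kokkos::HostSpace> A("A", nbatch, blk, blk),
        B("B", nbatch, blk, blk), C("C", nbatch, blk, blk);
    Kokkos::deep_copy(A, 1.0);
    Kokkos::deep_copy(B, 2.0);
    for (int k = 0; k < nbatch; ++k) {
      auto a = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL());
      auto b = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL());
      auto c = Kokkos::subview(C, k, Kokkos::ALL(), Kokkos::ALL());
      // C := 1.0*A*B + 1.0*C for one small matrix of the batch
      SerialGemm<Trans::NoTranspose, Trans::NoTranspose,
                 Algo::Gemm::Unblocked>::invoke(1.0, a, b, 1.0, c);
    }
  }
  Kokkos::finalize();
  return 0;
}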
a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp @@ -23,533 +23,512 @@ //#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD -#if defined( KokkosBatched_Test_Gemm_Host_Complex ) +#if defined(KokkosBatched_Test_Gemm_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Gemm_Host_Real ) +#if defined(KokkosBatched_Test_Gemm_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn, int kk) { - double m = (double)mm; double n = (double)nn; double k = (double)kk; - return (FLOP_MUL*(m*n*k) + - FLOP_ADD*(m*n*k)); - } - - template - void Gemm(const int NN) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn, int kk) { + double m = (double)mm; + double n = (double)nn; + double k = (double)kk; + return (FLOP_MUL * (m * n * k) + FLOP_ADD * (m * n * k)); +} + +template +void Gemm(const int NN) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } - - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - Kokkos::View cref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, BlkSize); - - Kokkos::Random_XorShift64_Pool random(13718); - Kokkos::fill_random(amat, random, value_type(1.0)); - Kokkos::fill_random(bmat, random, value_type(1.0)); - - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - bmat_simd("bmat_simd", N, BlkSize, BlkSize); - - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::Pack", - Kokkos::RangePolicy(0, N*VectorLength), - KOKKOS_LAMBDA(const int k) { - const int k0 = k/VectorLength, k1 = k%VectorLength; - for (int i=0;i flush; - - /// - /// Reference version using MKL DGEMM - /// -#if 
defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter::value) { - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - BlkSize, BlkSize, BlkSize, - one, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0(), - one, - (double*)cc.data(), cc.stride_0()); - } else if (std::is_same >::value) { - cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - BlkSize, BlkSize, BlkSize, - (void*)&one, - (void*)aa.data(), aa.stride_0(), - (void*)bb.data(), bb.stride_0(), - (void*)&one, - (void*)cc.data(), cc.stride_0()); - } - - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; + } + + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize, BlkSize); + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + Kokkos::View cref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(amat, random, value_type(1.0)); + Kokkos::fill_random(bmat, random, value_type(1.0)); + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + bmat_simd("bmat_simd", N, BlkSize, BlkSize); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::Pack", + Kokkos::RangePolicy(0, N * VectorLength), + KOKKOS_LAMBDA(const int k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat_simd(k0, i, j)[k1] = amat(k, i, j); + bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } - tavg /= iter_end; + }); - std::cout << std::setw(12) << "MKL DGEMM" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; + // for KNL (1MB per tile) + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - cref = c; - } + /// + /// Reference version using MKL DGEMM + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), + c("c", N * VectorLength, BlkSize, BlkSize); + + { + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::CblasOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); + + const double one = 1.0; + if (std::is_same::value) { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, + BlkSize, BlkSize, one, (double *)aa.data(), + aa.stride_0(), (double *)bb.data(), bb.stride_0(), + one, (double *)cc.data(), cc.stride_0()); + } else if (std::is_same >::value) { + cblas_zgemm(CblasRowMajor, CblasNoTrans, 
CblasNoTrans, BlkSize, + BlkSize, BlkSize, (void *)&one, (void *)aa.data(), + aa.stride_0(), (void *)bb.data(), bb.stride_0(), + (void *)&one, (void *)cc.data(), cc.stride_0()); + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + std::cout << std::setw(12) << "MKL DGEMM" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + + cref = c; + } + } #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) - { - typedef Kokkos::View ViewType; - ViewType - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - value_type - *aa[N*VectorLength], - *bb[N*VectorLength], - *cc[N*VectorLength]; - - for (int k=0;k + ViewType; + ViewType a("a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), + c("c", N * VectorLength, BlkSize, BlkSize); + + value_type *aa[N * VectorLength], *bb[N * VectorLength], + *cc[N * VectorLength]; + + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = &a(k, 0, 0); + bb[k] = &b(k, 0, 0); + cc[k] = &c(k, 0, 0); + } - { - double tavg = 0, tmin = tmax; - - MKL_INT blksize[1] = { BlkSize }; - MKL_INT lda[1] = { a.stride_1() }; - MKL_INT ldb[1] = { b.stride_1() }; - MKL_INT ldc[1] = { c.stride_1() }; - - CBLAS_TRANSPOSE transA[1] = { CblasNoTrans }; - CBLAS_TRANSPOSE transB[1] = { CblasNoTrans }; - - double one[1] = { 1.0 }; - MKL_INT size_per_grp[1] = { N*VectorLength }; - - for (int iter=iter_begin;iter::value) { - cblas_dgemm_batch(CblasRowMajor, - transA, - transB, - blksize, blksize, blksize, - one, - (const double**)aa, lda, - (const double**)bb, ldb, - one, - (double**)cc, ldc, - 1, size_per_grp); - } else if (std::is_same >::value) { - cblas_zgemm_batch(CblasRowMajor, - transA, - transB, - blksize, blksize, blksize, - one, - (const void**)aa, lda, - (const void**)bb, ldb, - one, - (void**)cc, ldc, - 1, size_per_grp); - } - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i zone(1.0); - - MKL_COMPACT_PACK format; - if (std::is_same::value) { - if (VectorLength == 4) format = MKL_COMPACT_AVX; - else if (VectorLength == 8) format = MKL_COMPACT_AVX512; - } else if (std::is_same >::value) { - if (VectorLength == 2) format = MKL_COMPACT_AVX; - else if (VectorLength == 4) format = MKL_COMPACT_AVX512; - } + { + Kokkos::View a( + "a", N, BlkSize, BlkSize), + b("b", N, BlkSize, BlkSize), c("c", N, BlkSize, BlkSize); + + { + double tavg = 0, tmin = tmax; + + double done(1.0); + std::complex zone(1.0); + + MKL_COMPACT_PACK format; + if (std::is_same::value) { + if (VectorLength == 4) + format = MKL_COMPACT_AVX; + else if (VectorLength == 8) + format = MKL_COMPACT_AVX512; + } else if (std::is_same >::value) { + if (VectorLength == 2) + format = MKL_COMPACT_AVX; + else if (VectorLength == 4) + format = MKL_COMPACT_AVX512; + } - if (format == MKL_COMPACT_AVX512 || format == MKL_COMPACT_AVX) { - for (int iter=iter_begin;iter::value) { - mkl_dgemm_compact(MKL_ROW_MAJOR, - MKL_NOTRANS, MKL_NOTRANS, - BlkSize, BlkSize, BlkSize, - done, - (const double*)a.data(), (MKL_INT)a.stride_1(), - (const double*)b.data(), (MKL_INT)b.stride_1(), - done, - ( double*)c.data(), (MKL_INT)c.stride_1(), - format, N*VectorLength); - } 
else if (std::is_same >::value) { - mkl_zgemm_compact(MKL_ROW_MAJOR, - MKL_NOTRANS, MKL_NOTRANS, - BlkSize, BlkSize, BlkSize, - (MKL_Complex16*)&zone, - (const double*)a.data(), (MKL_INT)a.stride_1(), - (const double*)b.data(), (MKL_INT)b.stride_1(), - (MKL_Complex16*)&zone, - ( double*)c.data(), (MKL_INT)c.stride_1(), - format, N*VectorLength); - } - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy(0, N*VectorLength); - - double tavg = 0, tmin = tmax; - - // adjust column major order in xsmm - char transA = 'N', transB = 'N'; - libxsmm_blasint blksize = BlkSize; - double one = 1.0; - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - // adjust transpose - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy( + 0, N * VectorLength); + + double tavg = 0, tmin = tmax; + + // adjust column major order in xsmm + char transA = 'N', transB = 'N'; + libxsmm_blasint blksize = BlkSize; + double one = 1.0; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); + + // column major + libxsmm_gemm((const char *)&transA, (const char *)&transB, + blksize, blksize, blksize, (const double *)&one, + (const double *)bb.data(), + (const libxsmm_blasint *)&ldb, + (const double *)aa.data(), + (const libxsmm_blasint *)&lda, (const double *)&one, + (double *)cc.data(), (const libxsmm_blasint *)&ldc); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + // adjust transpose + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += abs(cref(i, j, k) - c(i, j, k)); + + std::cout << std::setw(12) << "libxsmm" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; + } + libxsmm_finalize(); + } #endif - /// - /// Do not test this. 
Test Compact vs MKL - /// KK Scalar version (comparable to micro BLAS version) - /// - // if (!std::is_same::value) { - // Kokkos::View - // a("a", N*VectorLength, BlkSize, BlkSize), - // b("b", N*VectorLength, BlkSize, BlkSize), - // c("c", N*VectorLength, BlkSize, BlkSize); - - // { - // const Kokkos::RangePolicy policy(0, N*VectorLength); - - // double tavg = 0, tmin = tmax; - - // for (int iter=iter_begin;iter:: - // invoke(1.0, aa, bb, 1.0, cc); - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=cref.extent(2);i policy(0, N); - - double tavg = 0, tmin = tmax; - - for (int iter=iter_begin;iter:: - invoke(1.0, aa, bb, 1.0, cc); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy(0, + // N*VectorLength); + + // double tavg = 0, tmin = tmax; + + // for (int iter=iter_begin;iter:: + // invoke(1.0, aa, bb, 1.0, cc); + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=cref.extent(2);i policy(0, N); + + double tavg = 0, tmin = tmax; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_simd); + Kokkos::deep_copy(b, bmat_simd); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - std::cout << std::endl; + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += abs(cref(i, j, k) - + c(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(12) << "KK Vector" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } - - } // end perftest -} // end batched + } + std::cout << std::endl; +} +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp index 2fffa06855..484c519b1c 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp @@ -6,7 +6,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -19,35 +19,35 @@ void run(const int N) { // Test::Gemm<32, AlgoTagType>(N); // Test::Gemm<64, AlgoTagType>(N); - PerfTest::Gemm< 3, HostSpaceType, AlgoTagType>(N); - 
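A recurring detail in the host tests above is the SIMD repacking: VectorLength consecutive scalar matrices are interleaved into a single matrix of Vector<SIMD<value_type>,VectorLength> entries, so scalar batch index k lands in SIMD block k/VectorLength, lane k%VectorLength, and the result is read back as c(i/VectorLength, j, k)[i%VectorLength] when comparing against the scalar reference. The index arithmetic is shown below with plain arrays so it stands alone; the sizes are made up for the example.

#include <cassert>
#include <vector>

int main() {
  constexpr int VectorLength = 4, Nscalar = 8, Blk = 3;
  constexpr int Nsimd = Nscalar / VectorLength;
  // scalar layout: amat[k][i][j]; SIMD layout: amat_simd[k0][i][j][lane]
  std::vector<double> amat(Nscalar * Blk * Blk);
  std::vector<double> amat_simd(Nsimd * Blk * Blk * VectorLength);
  for (int k = 0; k < Nscalar * Blk * Blk; ++k) amat[k] = k;

  for (int k = 0; k < Nscalar; ++k) {
    const int k0 = k / VectorLength, k1 = k % VectorLength;  // SIMD block and lane
    for (int i = 0; i < Blk; ++i)
      for (int j = 0; j < Blk; ++j)
        amat_simd[((k0 * Blk + i) * Blk + j) * VectorLength + k1] =
            amat[(k * Blk + i) * Blk + j];
  }
  // spot-check: matrix 5, entry (1,2) lives in SIMD block 1, lane 1
  assert(amat_simd[((1 * Blk + 1) * Blk + 2) * VectorLength + 1] ==
         amat[(5 * Blk + 1) * Blk + 2]);
  return 0;
}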
PerfTest::Gemm< 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<3, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<5, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<10, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + int N[1] = {128 * 128}; - for (int i=1;i(N[i]); - + std::cout << "\n Testing Algo::Gemm::Blocked\n"; run(N[i]); @@ -55,7 +55,6 @@ int main(int argc, char *argv[]) { std::cout << "\n Testing Algo::Gemm::CompactMKL\n"; run(N[i]); #endif - } } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp index 031909d540..b062942341 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -18,32 +18,31 @@ void run(const int N) { // Test::Gemm<32, AlgoTagType>(N); // Test::Gemm<64, AlgoTagType>(N); - PerfTest::Gemm< 3, HostSpaceType, AlgoTagType>(N); - PerfTest::Gemm< 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<3, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<5, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<10, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + int N[1] = {128 * 128}; - for (int i=1;i(N[i]); - + std::cout << "\n Testing Algo::Gemm::Blocked\n"; run(N[i]); @@ -51,7 +50,6 @@ int main(int argc, char *argv[]) { std::cout << "\n Testing Algo::Gemm::CompactMKL\n"; run(N[i]); #endif - } } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp index 0a45a0b56b..9480b810ba 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp @@ -22,267 +22,271 @@ #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD -#if defined( KokkosBatched_Test_Gemv_Host_Complex ) +#if defined(KokkosBatched_Test_Gemv_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Gemv_Host_Real ) +#if defined(KokkosBatched_Test_Gemv_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(m*n) + - FLOP_ADD*(m*n)); - } - - template - void Gemv(const int NN) { - typedef Kokkos::Schedule ScheduleType; - //typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = 
DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (m * n) + FLOP_ADD * (m * n)); +} + +template +void Gemv(const int NN) { + typedef Kokkos::Schedule ScheduleType; + // typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } + } + + const double flop = + (N * VectorLength) * FlopCount(BlkSize, BlkSize) * NumVecs; + // const double tmax = 1.0e15; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize)*NumVecs; - //const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - Kokkos::View yref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize); - Kokkos::View - xvec("xvec", N*VectorLength, NumVecs, BlkSize); - - Kokkos::Random_XorShift64_Pool random(13718); - Kokkos::fill_random(xvec, random, value_type(1.0)); - Kokkos::fill_random(amat, random, value_type(1.0)); - - // for KNL - constexpr size_t LLC_CAPACITY = 34*1024*1024; - Flush flush; - - /// - /// Reference version using MKL DGEMM - /// + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + Kokkos::View yref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize); + Kokkos::View xvec( + "xvec", N * VectorLength, NumVecs, BlkSize); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(xvec, random, value_type(1.0)); + Kokkos::fill_random(amat, random, value_type(1.0)); + + // for KNL + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; + + /// + /// Reference version using MKL DGEMM + /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - x("x", N*VectorLength, NumVecs, BlkSize), - y("y", N*VectorLength, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double t = 0; - for (int iter=iter_begin;iter= 0)*timer.seconds(); - } - t /= iter_end; - - std::cout << std::setw(12) << "MKL DGEMV" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop/t) - << std::endl; - - yref = y; - } + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + x("x", N * VectorLength, NumVecs, 
BlkSize), + y("y", N * VectorLength, NumVecs, BlkSize); + + { + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(x, xvec); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::CblasOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + cblas_dgemv(CblasRowMajor, CblasNoTrans, BlkSize, BlkSize, 1.0, + (double*)aa.data(), aa.stride_0(), + (double*)xx.data(), xx.stride_0(), 1.0, + (double*)yy.data(), yy.stride_0()); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); } + t /= iter_end; + + std::cout << std::setw(12) << "MKL DGEMV" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s = " << (flop / t) << std::endl; + + yref = y; + } + } #endif - - /// - /// Plain version (comparable to micro BLAS version) - /// - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - x("x", N*VectorLength, NumVecs, BlkSize), - y("y", N*VectorLength, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double t = 0; - for (int iter=iter_begin;iter:: - invoke(1.0, aa, xx, 1.0, yy); - } - }); - - HostSpaceType().fence(); - t += (iter >= 0)*timer.seconds(); - } - t /= iter_end; - - double diff = 0; - for (int i=0,iend=yref.extent(0);i policy( + 0, N * VectorLength); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(x, xvec); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::SerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + SerialGemv::invoke(1.0, aa, xx, + 1.0, yy); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); + } + t /= iter_end; + + double diff = 0; + for (int i = 0, iend = yref.extent(0); i < iend; ++i) + for (int j = 0, jend = yref.extent(1); j < jend; ++j) + for (int k = 0, kend = yref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(yref(i, j, k) - + y(i, j, k)); + + std::cout << std::setw(12) << "Plain" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s = " << (flop / t) << " diff to ref = " << diff + << std::endl; + } + } + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + xvec_simd("xvec_simd", N, NumVecs, BlkSize); + + for (int k0 = 0; k0 < N; ++k0) + for (int k1 = 0; k1 < VectorLength; ++k1) + for (int i = 0; i < BlkSize; ++i) { + for (int j = 0; j < NumVecs; ++j) + xvec_simd(k0, j, i)[k1] = xvec(k0 * VectorLength + k1, j, i); + for (int j = 0; j < BlkSize; ++j) + amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); } - - typedef Vector,VectorLength> 
VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - xvec_simd("xvec_simd", N, NumVecs, BlkSize); - - for (int k0=0;k0 - a("a", N, BlkSize, BlkSize), - x("x", N, NumVecs, BlkSize), - y("y", N, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N); - - double t = 0; - for (int iter=iter_begin;iter:: - invoke(1.0, aa, xx, 1.0, yy); - } - }); - - HostSpaceType().fence(); - t += (iter >= 0)*timer.seconds(); - } - t /= iter_end; - - double diff = 0; - for (int i=0,iend=yref.extent(0);i policy(0, N); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_simd); + Kokkos::deep_copy(x, xvec_simd); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + SerialGemv::invoke(1.0, aa, xx, + 1.0, yy); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); } + t /= iter_end; + + double diff = 0; + for (int i = 0, iend = yref.extent(0); i < iend; ++i) + for (int j = 0, jend = yref.extent(1); j < jend; ++j) + for (int k = 0, kend = yref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs( + yref(i, j, k) - y(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(12) << "Serial SIMD" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s = " << (flop / t) << " diff to ref = " << diff + << std::endl; } - } } + +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp index 56ade7a446..75f4bca4c0 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -18,33 +18,32 @@ void run(const int N) { // PerfTest::Gemv<32, 1, ExecSpace,AlgoTagType>(N); // PerfTest::Gemv<64, 1, ExecSpace,AlgoTagType>(N); - PerfTest::Gemv< 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv< 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv<10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv<15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Gemv<3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<15, 1, HostSpaceType, AlgoTagType>(N); } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - const int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + const int N[1] = {128 * 128}; - { - for (int i=0;i(N[i]); - + std::cout << "\n Testing Algo::Gemv::Blocked\n"; run(N[i]); } } #endif Kokkos::finalize(); - + return 0; } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp 
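To make the printed flop/s concrete for the GEMV tests just above: FlopCount(m, n) counts one multiply and one add per matrix entry, i.e. 2*m*n flops for real data, so a 15x15 block costs 450 flops per matrix and per right-hand vector. With the default problem size NN = 128*128 = 16384 matrices and NumVecs = 1, each timed iteration therefore performs about 16384 * 450 = 7.37e6 flops, and the reported rate is simply that constant divided by the measured time. These figures are a worked example of the formula, not output from the tests.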
b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp index dcd60af9f0..6cf9ec5725 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp @@ -3,7 +3,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -23,551 +23,565 @@ #include "KokkosBatched_LU_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { - +namespace PerfTest { + #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; - - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - if (m > n) - return (FLOP_MUL*(0.5*m*n*n-(1.0/6.0)*n*n*n+0.5*m*n-0.5*n*n+(2.0/3.0)*n) + - FLOP_ADD*(0.5*m*n*n-(1.0/6.0)*n*n*n-0.5*m*n+ (1.0/6.0)*n)); - else - return (FLOP_MUL*(0.5*n*m*m-(1.0/6.0)*m*m*m+0.5*n*m-0.5*m*m+(2.0/3.0)*m) + - FLOP_ADD*(0.5*n*m*m-(1.0/6.0)*m*m*m-0.5*n*m+ (1.0/6.0)*m)); - } +typedef double value_type; + +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + if (m > n) + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - + 0.5 * m * n + (1.0 / 6.0) * n)); + else + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - + 0.5 * n * m + (1.0 / 6.0) * m)); +} - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; - template - struct Functor { - UnmanagedViewType _a; +template +struct Functor { + UnmanagedViewType _a; - KOKKOS_INLINE_FUNCTION - Functor() = default; + KOKKOS_INLINE_FUNCTION + Functor() = default; - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a) - : _a(a) {} + KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a) : _a(a) {} - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - SerialLU::invoke(aa); - } + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + SerialLU::invoke(aa); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - SerialLU::invoke(aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + SerialLU::invoke(aa); + } + }); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = 
member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member, aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, aa); + } + }); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - TeamLU::invoke(member, saa); - member.team_barrier(); - TeamCopy::invoke(member, saa, aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + const int lvl = 0; + ScratchViewType sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + TeamLU::invoke(member, saa); + member.team_barrier(); + TeamCopy::invoke(member, saa, aa); + } + }); + } +}; - }; - - template - void LU(const int NN, const int BlkSize) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } +template +void LU(const int NN, const int BlkSize) { + typedef Kokkos::Schedule ScheduleType; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize); - const double tmax = 1.0e15; - - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - - const int iter_begin = -3, iter_end = 50; - Kokkos::Timer timer; - - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - aref("aref", N*VectorLength, BlkSize, BlkSize); - - { - Random random; - for (int k=0;k::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } + + 
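For the square blocks used here (m = n = BlkSize), the LU FlopCount above reduces to (2/3)*n^3 - (1/2)*n^2 + (5/6)*n, the familiar ~2n^3/3 cost of an unpivoted LU; a 10x10 block, for example, comes to exactly 625 flops. Pivoting is deliberately avoided: the test matrices generated below are tridiagonal with the diagonal shifted by +10, hence strongly diagonally dominant, so the unpivoted factorization is stable and the CUBLAS path can pass a NULL pivot array to cublasDgetrfBatched (which, per cuBLAS's documented behavior, disables pivoting), allowing its factors to be compared elementwise against the KokkosBatched results.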
const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize); + const double tmax = 1.0e15; - // value_type d[BlkSize], v[BlkSize][BlkSize]; - // for (int i=0;i amat( + "amat", N * VectorLength, BlkSize, BlkSize), + aref("aref", N * VectorLength, BlkSize, BlkSize); + + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) { + // use tridiagonal matrices; for now we just check elementwise l/u factors + // do not allow pivots + for (int i = 0; i < BlkSize; ++i) { + amat(k, i, i) = random.value() + 10.0; + if ((i + 1) < BlkSize) { + amat(k, i, i + 1) = random.value() + 1.0; + amat(k, i + 1, i) = random.value() + 1.0; } } - constexpr size_t LLC_CAPACITY = 56*4*1024*1024; - Flush flush; - + // value_type d[BlkSize], v[BlkSize][BlkSize]; + // for (int i=0;i flush; + #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Batch version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); - - Kokkos::View a("a", stride); - Kokkos::View info("info", N*VectorLength); - - cublasStatus_t stat; - cublasHandle_t handle; - - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); - - auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); - Kokkos::deep_copy(amat_device, amat); + if (1) { + /// + /// CUBLAS Batch version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride); + Kokkos::View info("info", N * VectorLength); + + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); + + auto amat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), amat); + Kokkos::deep_copy(amat_device, amat); + + Kokkos::fence(); + { + double tavg = 0, tmin = tmax; + value_type *aa[N * VectorLength]; + + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = a.data() + k * a.stride_0(); + } + value_type **aa_device; + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess) { + Kokkos::abort("CUDA memory allocation failed\n"); + } + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, + cudaMemcpyHostToDevice) != cudaSuccess) { + Kokkos::abort("CUDA memcpy failed\n"); + } + Kokkos::fence(); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat_device); Kokkos::fence(); - { - double tavg = 0, tmin = tmax; - value_type *aa[N*VectorLength]; + timer.reset(); - for (int k=0;k= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - Kokkos::deep_copy(aref, asol); - - if (cudaFree(aa_device) != cudaSuccess) { - Kokkos::abort("CUDA memory free failed\n"); - } - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Batch" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = N/A" - << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; + stat = cublasDgetrfBatched(handle, BlkSize, (value_type **)aa_device, + BlkSize, NULL, (int *)info.data(), + N * VectorLength); + if (stat != CUBLAS_STATUS_SUCCESS) { + Kokkos::abort("CUBLAS LU Batched failed\n"); } + + Kokkos::fence(); 
+ const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto asol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + Kokkos::deep_copy(asol, a); + Kokkos::deep_copy(aref, asol); + + if (cudaFree(aa_device) != cudaSuccess) { + Kokkos::abort("CUDA memory free failed\n"); + } + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batch" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = N/A" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); + if (1) { + /// + /// Range policy version + /// + typedef Kokkos::View view_type; + view_type a("a", N * VectorLength, BlkSize, BlkSize); - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); + double tavg = 0, tmin = tmax; + { + typedef Functor functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int team_size = + policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", + policy, functor_type(a)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - if (1) { - /// - /// Team V1 - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::LU::Blocked::mb(); + // mp = BlkSize%mb > 0; + + const int + // mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; + mblk = is_blocked_algo ? 
(BlkSize - mb) : (BlkSize - 1); + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", + policy, functor_type(a)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - if (1) { - /// - /// Team V2 - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::LU::Blocked::mb(); - //mp = BlkSize%mb > 0; - - const int - //mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; + tavg /= iter_end; + + auto asol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + Kokkos::deep_copy(asol, a); + + double diff = 0; + for (int i = 0, iend = aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - + asol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); +#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) + std::cout << " diff to ref = " << diff; +#endif + std::cout << std::endl; + } + } + if (1) { + /// + /// Team V3 + /// + typedef Kokkos::View view_type; + view_type a("a", N * VectorLength, BlkSize, BlkSize); + + double tavg = 0, tmin = tmax; + { + typedef Kokkos::TeamPolicy + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::LU::Blocked::mb(); + // mp = BlkSize%mb > 0; + + const int + // mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; mblk = is_blocked_algo ? 
(BlkSize - mb) : (BlkSize - 1); - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*2,1), max_team_size); + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::LU::Blocked::mb(); - // mp = BlkSize%mb > 0; - - const int - //mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - mblk = is_blocked_algo ? (BlkSize - mb) : (BlkSize - 1); - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*2,1), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i +template void run(const int N, const int B) { typedef Kokkos::DefaultExecutionSpace ExecSpace; Kokkos::print_configuration(std::cout, false); if (B != 0) { - PerfTest::LU(N,B); + PerfTest::LU(N, B); } else { PerfTest::LU(N, 3); PerfTest::LU(N, 5); - PerfTest::LU(N,10); - PerfTest::LU(N,15); + PerfTest::LU(N, 10); + PerfTest::LU(N, 15); } } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0; + int N = 128 * 128, B = 0; - for (int i=1;i(N,B); - + run(N, B); + std::cout << "\n Testing LayoutLeft Algo::LU::Blocked\n"; - run(N,B); + run(N, B); } Kokkos::finalize(); diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp index 33cbd78b6c..68daa24eb1 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp @@ -20,312 +20,324 @@ #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD - // no complex yet -#if defined( KokkosBatched_Test_LU_Host_Complex ) +// no complex yet +#if defined(KokkosBatched_Test_LU_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_LU_Host_Real ) +#if defined(KokkosBatched_Test_LU_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - if (m > n) - return (FLOP_MUL*(0.5*m*n*n-(1.0/6.0)*n*n*n+0.5*m*n-0.5*n*n+(2.0/3.0)*n) + - 
FLOP_ADD*(0.5*m*n*n-(1.0/6.0)*n*n*n-0.5*m*n+ (1.0/6.0)*n)); - else - return (FLOP_MUL*(0.5*n*m*m-(1.0/6.0)*m*m*m+0.5*n*m-0.5*m*m+(2.0/3.0)*m) + - FLOP_ADD*(0.5*n*m*m-(1.0/6.0)*m*m*m-0.5*n*m+ (1.0/6.0)*m)); - } - - template - void LU(const int NN) { - typedef Kokkos::Schedule ScheduleType; - //typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + if (m > n) + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - + 0.5 * m * n + (1.0 / 6.0) * n)); + else + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - + 0.5 * n * m + (1.0 / 6.0) * m)); +} + +template +void LU(const int NN) { + typedef Kokkos::Schedule ScheduleType; + // typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif + } + + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize); + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + /// + /// Reference version using MKL DGETRF + /// + Kokkos::View aref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize); + + Random random; + + for (int k = 0; k < N * VectorLength; ++k) { + // use tridiagonal matrices; for now we just check elementwise l/u factors + // do not allow pivots + for (int i = 0; i < BlkSize; ++i) { + amat(k, i, i) = random.value() + 10.0; + if ((i + 1) < BlkSize) { + amat(k, i, i + 1) = random.value() + 1.0; + amat(k, i + 1, i) = random.value() + 1.0; } - - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize); - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - /// - /// Reference version using MKL DGETRF - /// - Kokkos::View aref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize); - - Random random; - - for (int k=0;k, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); + + Kokkos::parallel_for( + 
"KokkosBatched::PerfTest::LUHost::Pack", + Kokkos::RangePolicy(0, N * VectorLength), + KOKKOS_LAMBDA(const int k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); } - } - } + }); - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); - - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack", - Kokkos::RangePolicy(0, N*VectorLength), - KOKKOS_LAMBDA(const int k) { - const int k0 = k/VectorLength, k1 = k%VectorLength; - for (int i=0;i flush; - - /// - /// Reference version using MKL DGETRF - /// -#if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View a("a", N*VectorLength, BlkSize, BlkSize); - Kokkos::View p("p", N*VectorLength, BlkSize); - { - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter policy(0, N*VectorLength); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto pp = Kokkos::subview(p, k, Kokkos::ALL()); - LAPACKE_dgetrf(LAPACK_ROW_MAJOR, - BlkSize, BlkSize, - (double*)aa.data(), aa.stride_0(), - (int*)pp.data()); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - std::cout << std::setw(10) << "MKL LU" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; - } + // for KNL + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - aref = a; + /// + /// Reference version using MKL DGETRF + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize); + Kokkos::View p( + "p", N * VectorLength, BlkSize); + { + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::RangePolicy policy( + 0, N * VectorLength); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto pp = Kokkos::subview(p, k, Kokkos::ALL()); + LAPACKE_dgetrf(LAPACK_ROW_MAJOR, BlkSize, BlkSize, + (double*)aa.data(), aa.stride_0(), + (int*)pp.data()); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + std::cout << std::setw(10) << "MKL LU" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + + aref = a; + } #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) #endif #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) - { - Kokkos::View - a("a", N, BlkSize, BlkSize); - - { - double tavg = 0, tmin = tmax; - MKL_COMPACT_PACK format; - if (VectorLength == 8) format = MKL_COMPACT_AVX512; - else if (VectorLength == 4) format = MKL_COMPACT_AVX; - - if (format == MKL_COMPACT_AVX512 || format == MKL_COMPACT_AVX) { - int info; - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int 
i=0,iend=aref.extent(0);i= 0) * t; } + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += abs(aref(i, j, k) - + a(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(10) << "MKL Cmpt" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } + } + } #endif #endif - // /// - // /// Plain version (comparable to micro BLAS version) - // /// - - // { - // Kokkos::View - // a("a", N*VectorLength, BlkSize, BlkSize); - - // { - // double tavg = 0, tmin = tmax; - // for (int iter=iter_begin;iter policy(0, N*VectorLength); - // Kokkos::parallel_for - // (policy, - // KOKKOS_LAMBDA(const int k) { - // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - - // SerialLU::invoke(aa); - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=aref.extent(0);i policy(0, N); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - - SerialLU::invoke(aa); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=aref.extent(0);i policy(0, + // N*VectorLength); Kokkos::parallel_for + // (policy, + // KOKKOS_LAMBDA(const int k) { + // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + // SerialLU::invoke(aa); + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=aref.extent(0);i policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + SerialLU::invoke(aa); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += abs(aref(i, j, k) - + a(i / VectorLength, j, k)[i % VectorLength]); + std::cout << std::setw(10) << "SIMD" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } + } +} - } // namespace PerfTest -} // namespace KokkosBatched - - +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp index 7d352283c6..6c0736501d 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp @@ -5,36 +5,35 @@ using namespace KokkosBatched; -template +template void run(const int N) { 
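  // run() prints the Kokkos configuration and then times PerfTest::LU on the
  // default host execution space for block sizes 3, 5, 10 and 15; N is the
  // total number of matrices and PerfTest::LU divides it by the SIMD vector
  // length internally. The driver is templated on the algorithm tag used in
  // the PerfTest::LU<...> instantiations below; a hedged sketch of the
  // signature and of how main() invokes it, matching the
  // "Testing Algo::LU::..." messages printed there:
  //
  //   template <typename AlgoTagType>
  //   void run(const int N);
  //
  //   run<Algo::LU::Unblocked>(N);
  //   run<Algo::LU::Blocked>(N);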
typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; Kokkos::print_configuration(std::cout, false); - PerfTest::LU< 3, HostSpaceType,AlgoTagType>(N); - PerfTest::LU< 5, HostSpaceType,AlgoTagType>(N); - PerfTest::LU<10, HostSpaceType,AlgoTagType>(N); - PerfTest::LU<15, HostSpaceType,AlgoTagType>(N); + PerfTest::LU<3, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<5, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<10, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) - int N = 128*128; + int N = 128 * 128; - for (int i=1;i(N); - + std::cout << "\n Testing Algo::LU::Blocked\n"; run(N); diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp index 807b7a884e..6000bc7c9d 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp @@ -3,7 +3,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -24,755 +24,746 @@ #include "KokkosBatched_Trsm_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { - +namespace PerfTest { + #undef FLOP_MUL #undef FLOP_ADD #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; +typedef double value_type; - double FlopCountLower(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - double FlopCountUpper(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); +double FlopCountLower(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} + +double FlopCountUpper(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} + +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; + +template +struct Functor { + ConstUnmanagedViewType _a; + UnmanagedViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor() = default; + + KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a, const ViewType &b) : _a(a), _b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 1: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 2: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 3: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 4: + SerialTrsm::invoke(1.0, aa, bb); + break; } + } - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; - - template - struct Functor { - ConstUnmanagedViewType _a; - UnmanagedViewType _b; - - KOKKOS_INLINE_FUNCTION - Functor() = default; - - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a, - const ViewType &b) - : _a(a), _b(b) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, 
Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 1: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 2: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 3: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 4: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - } - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); break; case 1: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 2: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 3: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 4: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; - } } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 1: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 2: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 3: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 4: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; - } } - }); - } + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + 
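    // TeamTagV3 is the team-parallel variant that stages the triangular
    // factor in team scratch: each team copies its VectorLength A blocks into
    // level-0 scratch memory (shared memory on CUDA) with TeamCopy, issues a
    // team_barrier(), and then runs TeamTrsm against the scratch copy so the
    // repeated reads of A during the solve hit fast memory. B stays in global
    // memory; the matching sb scratch view is intentionally left commented
    // out below. The launch site sizes the request with
    // ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize), attaches
    // it via set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)), and
    // only runs this variant when the request stays under the 48 KB per-team
    // limit checked later in this file.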
const int lvl = 0; + ScratchViewType sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + // ScratchViewType sb(member.team_scratch(lvl), VectorLength, + // _b.extent(1), _b.extent(2)); - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - //ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - - switch (test) { - case 0: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + + switch (test) { + case 0: + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 1: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 2: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 3: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 4: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; - } } - }); - } + } + }); + } +}; - }; +template +void Trsm(const int NN, const int BlkSize, const int NumCols) { + typedef Kokkos::Schedule ScheduleType; + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; - template - void Trsm(const int NN, const int BlkSize, const int NumCols) { - typedef Kokkos::Schedule ScheduleType; + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } - switch (test) { - case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; - case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; - case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; - case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; - case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; - } + switch (test) { + case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; + 
case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; + case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; + case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; + case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; + } - // when m == n, lower upper does not matter (unit and nonunit) - double flop = 0; - switch (test) { - case 0: - case 1: - flop = FlopCountLower(BlkSize,NumCols); - break; - case 2: - case 3: - case 4: - flop = FlopCountUpper(BlkSize,NumCols); - break; - } - flop *= (N*VectorLength); - const double tmax = 1.0e15; - - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - - const int iter_begin = -3, iter_end = 30; - Kokkos::Timer timer; - - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, NumCols), - bref("bmat", N*VectorLength, BlkSize, NumCols); - - { - Random random; - for (int k=0;k flush; + // when m == n, lower upper does not matter (unit and nonunit) + double flop = 0; + switch (test) { + case 0: + case 1: flop = FlopCountLower(BlkSize, NumCols); break; + case 2: + case 3: + case 4: flop = FlopCountUpper(BlkSize, NumCols); break; + } + flop *= (N * VectorLength); + const double tmax = 1.0e15; -#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Batch version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); + typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; + typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - Kokkos::View - a("a", stride), - b("b", stride); + const int iter_begin = -3, iter_end = 30; + Kokkos::Timer timer; - cublasStatus_t stat; - cublasHandle_t handle; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, NumCols), + bref("bmat", N * VectorLength, BlkSize, NumCols); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) { + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + amat(k, i, j) = random.value() + 4.0 * (i == j); + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < NumCols; ++j) bmat(k, i, j) = random.value(); + } + } - auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); - auto bmat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), bmat); + // P100 L2 cache 4MB per core + constexpr size_t LLC_CAPACITY = 56 * 4 * 1024 * 1024; + Flush flush; - Kokkos::deep_copy(amat_device, amat); - Kokkos::deep_copy(bmat_device, bmat); +#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) + if (1) { + /// + /// CUBLAS Batch version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride), + b("b", stride); + + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); + + auto amat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), amat); + auto bmat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), bmat); + + Kokkos::deep_copy(amat_device, amat); + Kokkos::deep_copy(bmat_device, bmat); + + 
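    // The batched cuBLAS path below uses the standard *Batched pointer-array
    // pattern: the LayoutStride views keep each matrix contiguous as a block
    // of BlkSize * BlkSize entries, a host array of per-matrix device
    // pointers is built from the base pointer plus k * stride_0(), and that
    // pointer array is itself copied to the device because cublasDtrsmBatched
    // expects a device-resident array of matrix pointers. A minimal sketch of
    // the pattern (hypothetical names, same arithmetic as the code below):
    //
    //   std::vector<value_type *> host_ptrs(batch);
    //   for (int k = 0; k < batch; ++k)
    //     host_ptrs[k] = a.data() + k * a.stride_0();
    //   value_type **dev_ptrs = nullptr;
    //   cudaMalloc(&dev_ptrs, batch * sizeof(value_type *));
    //   cudaMemcpy(dev_ptrs, host_ptrs.data(), batch * sizeof(value_type *),
    //              cudaMemcpyHostToDevice);
    //   // dev_ptrs is then passed as the Aarray/Barray argument.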
Kokkos::fence(); + + const double one(1.0); //, zero(0.0); + { + double tavg = 0, tmin = tmax; + value_type *aa[N * VectorLength], *bb[N * VectorLength]; + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = a.data() + k * a.stride_0(); + bb[k] = b.data() + k * b.stride_0(); + } + value_type **aa_device, **bb_device; + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess || + cudaMalloc(&bb_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess) { + Kokkos::abort("CUDA memory allocation failed\n"); + } + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, + cudaMemcpyHostToDevice) != cudaSuccess || + cudaMemcpy(bb_device, bb, sizeof(value_type *) * N * VectorLength, + cudaMemcpyHostToDevice) != cudaSuccess) { + Kokkos::abort("CUDA memcpy failed\n"); + } + Kokkos::fence(); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); - Kokkos::fence(); + // initialize matrices + Kokkos::deep_copy(a, amat_device); + Kokkos::deep_copy(b, bmat_device); - const double one(1.0); //, zero(0.0); - { - double tavg = 0, tmin = tmax; - value_type - *aa[N*VectorLength], - *bb[N*VectorLength]; - for (int k=0;k= 0)*t; + case 3: { + // Right, Upper, NoTrans, NonUnitDiag + stat = cublasDtrsmBatched( + handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, + (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); + break; } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - Kokkos::deep_copy(bref, bsol); - - if (cudaFree(aa_device) != cudaSuccess || - cudaFree(bb_device) != cudaSuccess) { - Kokkos::abort("CUDA memory free failed\n"); + case 4: { + // Left, Upper, NoTrans, NonUnitDiag + stat = cublasDtrsmBatched( + handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, + (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); + break; } - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Batched" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = N/A" - << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; } - cublasDestroy(handle); + + if (stat != CUBLAS_STATUS_SUCCESS) { + Kokkos::abort("CUBLAS Trsm Batched failed\n"); + } + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + Kokkos::deep_copy(bref, bsol); + + if (cudaFree(aa_device) != cudaSuccess || + cudaFree(bb_device) != cudaSuccess) { + Kokkos::abort("CUDA memory free failed\n"); + } + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batched" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " ScratchSize (KB) = N/A" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + cublasDestroy(handle); + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", 
N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::RangeTag", policy, + functor_type(a, b)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - + bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V1 - almost same scheduling with range policy - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int team_size = + policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV1", policy, + functor_type(a, b)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; 
++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - + bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V2 - team parallel - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(NumCols,(mblk-1)*mblk), max_team_size); - - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = + std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + DeviceSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, + functor_type(a, b)); + + DeviceSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - + bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V3 - team parallel + sratch - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch - = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(NumCols,(mblk-1)*mblk), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = + std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + + policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + DeviceSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, + functor_type(a, b)); + + DeviceSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = Kokkos::create_mirror_view( + typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - + bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) + << (per_team_scratch / 1024) << " time = " << std::scientific + << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } else { - std::cout << std::setw(8) << "Kokkos" - << std::setw(8) << "Team V3" - << " Scratch per team is too big (KB): " << (per_team_scratch/1024) - << std::endl; - } - } + std::cout << std::endl; + } else { + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " Scratch per team is too big (KB): " + << (per_team_scratch / 1024) << std::endl; } - std::cout << "\n\n"; } } + std::cout << "\n\n"; } +} // namespace PerfTest +} // namespace KokkosBatched using namespace KokkosBatched; -template +template void run(const int N, const int B, const int R) { typedef Kokkos::DefaultExecutionSpace ExecSpace; Kokkos::print_configuration(std::cout, false); if (B != 0 && R != 0) { - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,B,R); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, B, R); } else { - std::cout << "\n\n Used for Factorization \n\n"; /// Left, Lower, NoTrans, UnitDiag (used in LU factorization and LU solve) PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 15, 15); /// Left, Lower, NoTrans, NonUnitDiag PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 15, 15); /// Right, Upper, NoTrans, UnitDiag PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 5, 5); - 
PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 15, 15); /// Right, Upper, NoTrans, NonUnitDiag (used in LU factorization) PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 15, 15); std::cout << "\n\n Used for Solve \n\n"; @@ -780,26 +771,25 @@ void run(const int N, const int B, const int R) { PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 3, 1); PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 5, 1); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,10, 1); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,15, 1); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 10, 1); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 15, 1); /// Left, Upper, Notrans, NonUnitDiag (user in LU solve) PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 3, 1); PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 5, 1); - PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N,10, 1); - PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N,15, 1); + PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 10, 1); + PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 15, 1); } } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0, R = 0; + int N = 128 * 128, B = 0, R = 0; - for (int i=1;i(N,B,R); + run(N, B, R); std::cout << "\n Testing LayoutLeft Algo::Trsm::Blocked\n"; - run(N,B,R); + run(N, B, R); } Kokkos::finalize(); @@ -822,7 +812,7 @@ int main(int argc, char *argv[]) { return 0; } -#else +#else int main(int argc, char *argv[]) { std::cout << "Kokkos::Cuda is not enabled\n"; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp index 0e14fe0cf9..0770055cb0 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp @@ -18,641 +18,602 @@ //#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD - // no complex yet -#if defined( KokkosBatched_Test_Trsm_Host_Complex ) +// no complex yet +#if defined(KokkosBatched_Test_Trsm_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Trsm_Host_Real ) +#if defined(KokkosBatched_Test_Trsm_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCountLower(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - double FlopCountUpper(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - template - void Trsm(const int NN) { - typedef Kokkos::Schedule ScheduleType; +double FlopCountLower(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; +double 
FlopCountUpper(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +template +void Trsm(const int NN) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } + } + + switch (test) { + case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; + case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; + case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; + case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; + case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; + } - switch (test) { - case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; - case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; - case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; - case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; - case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; + // when m == n, lower upper does not matter (unit and nonunit) + double flop = 0; + switch (test) { + case 0: + case 1: flop = FlopCountLower(BlkSize, NumCols); break; + case 2: + case 3: + case 4: flop = FlopCountUpper(BlkSize, NumCols); break; + } + flop *= (N * VectorLength); + + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + /// + /// Reference version using MKL DTRSM + /// + Kokkos::View bref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, NumCols); + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + bmat_simd("bmat_simd", N, BlkSize, NumCols); + + Random random; + + for (int k = 0; k < N * VectorLength; ++k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat(k, i, j) = random.value() + 4.0 * (i == j); + amat_simd(k0, i, j)[k1] = amat(k, i, j); + } + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < NumCols; ++j) { + bmat(k, i, j) = random.value(); + bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } + } + + // for KNL + 
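  // The benchmark flushes the last-level cache before every timed iteration
  // (see flush.run() at the top of each timing loop) so that A and B are
  // reloaded from memory instead of being measured cache-hot. LLC_CAPACITY is
  // sized for the target CPU: the 34 MB used here is consistent with the
  // aggregate per-tile L2 of a 68-core Knights Landing part (34 tiles x 1 MB),
  // whereas the GPU-side tests above use 56 * 4 MB. The idea behind the Flush
  // utility is simply to stream over a buffer at least as large as the cache;
  // an illustrative sketch (not the actual Flush implementation in this
  // repository):
  //
  //   std::vector<double> junk(LLC_CAPACITY / sizeof(double), 1.0);
  //   volatile double sink = 0;
  //   for (double x : junk) sink += x;  // touch every line to evict old data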
constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - // when m == n, lower upper does not matter (unit and nonunit) - double flop = 0; - switch (test) { - case 0: - case 1: - flop = FlopCountLower(BlkSize,NumCols); - break; - case 2: - case 3: - case 4: - flop = FlopCountUpper(BlkSize,NumCols); - break; + /// + /// Reference version using MKL DTRSM + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, NumCols); + + { + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::RangePolicy policy( + 0, N * VectorLength); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, + CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 1: + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 2: + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, + CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 3: + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 4: + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasUpper, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - flop *= (N*VectorLength); - - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - /// - /// Reference version using MKL DTRSM - /// - Kokkos::View bref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, NumCols); - - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - bmat_simd("bmat_simd", N, BlkSize, NumCols); - - Random random; - - for (int k=0;k::abs(bmat(i, j, k)); + + std::cout << std::setw(10) << "MKL TRSM" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " sum abs(B) = " << sum + << std::endl; + + bref = b; + } + } +#if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, NumCols); + + value_type *aa[N * VectorLength], *bb[N * VectorLength]; + + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = &a(k, 0, 0); + bb[k] = &b(k, 0, 0); + } + + { + double tavg = 0, tmin = tmax; + + MKL_INT blksize[1] = {BlkSize}; + MKL_INT numcols[1] = {NumCols}; + + MKL_INT lda[1] = {a.stride_1()}; 
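      // cblas_dtrsm_batch follows MKL's grouped-batch convention: side, uplo,
      // trans, diag, the matrix sizes, alpha and the leading dimensions are
      // all passed as arrays with one entry per group, and the trailing
      // arguments give the group count and the group sizes. This test uses a
      // single group of N * VectorLength same-size problems, so every array
      // declared here has exactly one element and size_per_grp[0] is
      // N * VectorLength.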
+ MKL_INT ldb[1] = {b.stride_1()}; + + double one[1] = {1.0}; + MKL_INT size_per_grp[1] = {N * VectorLength}; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + HostSpaceType().fence(); + timer.reset(); + + switch (test) { + case 0: { + CBLAS_SIDE side[1] = {CblasLeft}; + CBLAS_UPLO uplo[1] = {CblasLower}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; } - for (int i=0;i flush; - - /// - /// Reference version using MKL DTRSM - /// -#if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - { - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter policy(0, N*VectorLength); - Kokkos::parallel_for("KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasLower, CblasNoTrans, CblasUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 1: - cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 2: - cblas_dtrsm(CblasRowMajor, - CblasRight, CblasUpper, CblasNoTrans, CblasUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 3: - cblas_dtrsm(CblasRowMajor, - CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 4: - cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - } - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; + case 2: { + CBLAS_SIDE side[1] = {CblasRight}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; + } + case 3: { + CBLAS_SIDE side[1] = {CblasRight}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasNonUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; + } + case 4: { + CBLAS_SIDE side[1] = {CblasLeft}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasNonUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; } - tavg /= iter_end; - - double sum = 0; - for (int i=0,iend=b.extent(0);i::abs(bref(i, j, k) - + b(i, j, k)); + + std::cout << std::setw(10) << 
"MKL Batch" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; + } + } +#endif - MKL_INT lda[1] = { a.stride_1() }; - MKL_INT ldb[1] = { b.stride_1() }; +#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) + { + Kokkos::View a( + "a", N, BlkSize, BlkSize), + b("b", N, BlkSize, NumCols); + + { + double tavg = 0, tmin = tmax; + + MKL_COMPACT_PACK format; - double one[1] = { 1.0 }; - MKL_INT size_per_grp[1] = { N*VectorLength }; + if (VectorLength == 4) + format = MKL_COMPACT_AVX; + else if (VectorLength == 8) + format = MKL_COMPACT_AVX512; - for (int iter=iter_begin;iter= 0)*t; } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=bref.extent(0);i= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=bref.extent(0);i policy(0, N*VectorLength); - // Kokkos::parallel_for - // (policy, - // KOKKOS_LAMBDA(const int k) { - // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - // auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - // switch (test) { - // case 0: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 1: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 2: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 3: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 4: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // } - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=bref.extent(0);i policy(0, N); - Kokkos::parallel_for("KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 1: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 2: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 3: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 4: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - } - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=bref.extent(0);i policy(0, + // N*VectorLength); Kokkos::parallel_for + // (policy, + // KOKKOS_LAMBDA(const int k) { + // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + // auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + // switch (test) { + // case 0: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 1: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 2: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 3: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 4: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // } + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=bref.extent(0);i::abs(bref(i,j,k) - + // b(i,j,k)); + + // std::cout << std::setw(10) << "KK Scalar" + // << " BlkSize = " << std::setw(3) 
<< BlkSize + // << " NumCols = " << std::setw(3) << NumCols + // << " time = " << std::scientific << tmin + // << " avg flop/s = " << (flop/tavg) + // << " max flop/s = " << (flop/tmin) + // << " diff to ref = " << diff + // << std::endl; + // } + // } + + /// + /// SIMD with appropriate data layout + /// + { + Kokkos::View a( + "a", N, BlkSize, BlkSize), + b("b", N, BlkSize, NumCols); + + { + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_simd); + Kokkos::deep_copy(b, bmat_simd); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::RangePolicy policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 1: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 2: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 3: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 4: + SerialTrsm::invoke(1.0, aa, bb); + break; + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - std::cout << "\n\n"; + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs( + bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(10) << "KK Vector" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } } + std::cout << "\n\n"; } - +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp index bb82e0e56d..3d45195bb1 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -15,56 +15,55 @@ void run(const int N) { /// Left, Lower, NoTrans, UnitDiag (used in LU factorization and LU solve) - PerfTest::Trsm<0, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<0, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 15, 15, HostSpaceType, AlgoTagType>(N); /// Left, Lower, NoTrans, NonUnitDiag - PerfTest::Trsm<1, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<1, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 10, 
10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 15, 15, HostSpaceType, AlgoTagType>(N); /// Right, Upper, NoTrans, UnitDiag - PerfTest::Trsm<2, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<2, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<2,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<2,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<2, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 15, 15, HostSpaceType, AlgoTagType>(N); /// Right, Upper, NoTrans, NonUnitDiag (used in LU factorization) - PerfTest::Trsm<3, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<3, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 15, 15, HostSpaceType, AlgoTagType>(N); std::cout << "\n\n Used for Solve \n\n"; /// Left, Lower, NoTrans, UnitDiag (used in LU solve) - PerfTest::Trsm<0, 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0, 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<0, 3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 15, 1, HostSpaceType, AlgoTagType>(N); /// Left, Upper, Notrans, NonUnitDiag (user in LU solve) - PerfTest::Trsm<4, 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4, 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4,10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4,15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<4, 3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 15, 1, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) - int N = 128*128; + int N = 128 * 128; - for (int i=1;i(N); #endif - } #endif Kokkos::finalize(); diff --git a/perf_test/blas/KokkosBlas_blas1.cpp b/perf_test/blas/KokkosBlas_blas1.cpp index 01c6c430fa..764f800f39 100644 --- a/perf_test/blas/KokkosBlas_blas1.cpp +++ b/perf_test/blas/KokkosBlas_blas1.cpp @@ -46,10 +46,10 @@ #include #include #ifdef HAVE_MPI -# include +#include #else -# include -#endif // HAVE_MPI +#include +#endif // HAVE_MPI using Teuchos::Comm; using Teuchos::CommandLineProcessor; @@ -60,61 +60,58 @@ using Teuchos::TimeMonitor; // Create a new timer with the given name if it hasn't already been // created, else get the previously created timer with that name. -RCP